Skip to content

Commit

Permalink
gpu: jit: gemm: performant fp8 strategies
Browse files Browse the repository at this point in the history
  • Loading branch information
kealan-barbieri authored and karturov committed Apr 10, 2024
1 parent 668abae commit 068f850
Showing 1 changed file with 12 additions and 4 deletions.
16 changes: 12 additions & 4 deletions src/gpu/jit/gemm/kernel.db
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
*******************************************************************************/

/*@kcatalog@*/
kcatalog::FlatCatalog<959> _CATALOG_
{1, 8271, 959, {
kcatalog::FlatCatalog<967> _CATALOG_
{1, 8271, 967, {
{{'9', "gemm", {"B", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab2x2 as8x2 ab l4 acb", {8, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 16, 16}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {512}}},
{{'9', "gemm", {"B", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "ab4 ab4x2 ab l4 acb nmk", {8, (LoopType) 255, 128, {(LoopType) 1, (LoopType) 0, (LoopType) 255}, {4096, 4096, 2048}, {4096, 4096, 2048}, {32, 16, 8}, {8, 2, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {512}}},
{{'9', "gemm", {"B", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "as8 as16 ab l4 acb", {8, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {4096, 4096, 1024}, {4096, 4096, 1024}, {16, 16, 16}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {2, 2, 4}, {true, true, true}}, {'W', 1, {256}}},
Expand Down Expand Up @@ -819,9 +819,17 @@ kcatalog::FlatCatalog<959> _CATALOG_
{{'F', "gemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABopqI"}, "am64+S1,64@128 av64+B64@128 aS cs di sys grf256 af wg 8x4 bo sb512 sm bk0 sr", {16, (LoopType) 255, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {8192, 8192, 16777216}, {32, 64, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {696651, 1.10436e+06, 0, 0, 0, 0, 0.942188, 0.965594, 1.06189, 2.04065, 0.00368306, 0.00368306, 0, 0.911159, 1.34929, 0.941877, 1.64736e-12}}},
{{'F', "gemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "oI"}, "aS32 aS32 aB sys grf256 cab2 wg 4x4 ek l4 sr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {48, 32, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 20480, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.16401e+06, 348644, 0, 0, 0, 0, 0.807306, 0.892675, 0.990554, 1.4802, 0.00939438, 0.000733543, 0.0109328, 0.899502, 1.01113, 1.00523, 2.16142e-14}}},
{{'F', "gemm", {"O", "O", "I"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aB16 aB16 aB wg 8x4 cab3 ks32 nse hi pt sr bk0 grf256 kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 49152, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.02524e+06, 922411, 0, 0, 4.21888e+06, 8.7081e+06, 0.917542, 0.658478, 0.919692, 1.40366, 0.0167399, 0.0167399, 0, 1, 1.08933, 0.991295, 6.92853e-13}}},
{{'F', "gemm", {"Q", "Q", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av32+m32@48 am32+S32@64 aB wg 4x8 xaf st vav di hi pt sb32 bk0 sn grf256 sys sr br kv afb rr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {32, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {869760, 741196, 0, 0, 8.192e+06, 1.05431e+07, 0.731287, 0.777012, 0.881104, 1.51408, 0.00403024, 0.00403024, 0, 0.998184, 1.76752, 1.28618, 2.08947e-12}}},
{{'F', "gemm", {"Q", "Q", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ip"}, "aB32 aB32 aB wg 8x4 cab3 ks32 af vav di hi pt sr br bk0 sn nb 8x4 dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 32}, {16, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 441, 86016, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07153e+06, 922406, 0, 0, 5.48536e+06, 9.18323e+06, 0.85293, 1.19559, 1.04485, 1.64281, 0.00471518, 0.00471518, 0, 0.961495, 1.72318, 1.24713, 3.69593e-12}}},
{{'F', "gemm", {"Q", "Q", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "aB8+B8@16 aB16+B16@16 aB wg 4x4 vav di hi pt sb256 bk0 grf256 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {16, 16, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.10018e+06, 251905, 0, 0, 0, 0, 1.56408, 2.85947, 0.648851, 1.37611, 0.0629702, 0.000146865, 0.0632313, 0.517444, 1.16754, -0.0884205, 2.14696e-11}}},
{{'F', "gemm", {"Q", "Q", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "A"}, "aB8+S1,16@24 aS16+S32@16 aB wg 2x2x8 kr vav hi pt sr sb256 bk0 sm sn grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {8192, 8192, 0}, {16, 16, 16}, {2, 2, 8}, 1, (WGType) 1, 413, 0, 8192, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.2289e+06, -127728, -18531.3, 192240, 3.35053e+06, 0, 0.932716, 1.33521, 0.665104, 1.39923, 0.0628179, 0.0675437, 0.0114361, 0.999809, 1.27564, 0.821381, 3.76564e-11}}},
{{'F', "gemm", {"Q", "Q", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16 am16+m16@16 aB wg 2x4x4 kr ca3x2 ks16 af vav di hi pt sr br bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {2, 4, 4}, 1, (WGType) 1, 445, 6144, 16384, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.27954e+06, -187821, -42333.9, 291644, 3.34234e+06, 2.63782e+06, 0.670967, 0.826166, 0.942564, 1.64083, 0.0148244, 0.00555253, 0.00975056, 0.806514, 1.26716, 0.788997, 1.48059e-11}}},
{{'F', "gemm", {"Q", "Q", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32+m32@48 av32 aB wg 8x4 cb4 ks32 xaf st vav di hi pt sr br bk0 grf256 sys rr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 32}, {16, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 49152, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.00046e+06, 649003, 0, 0, 5.58776e+06, 8.89651e+06, 0.806799, 1.52159, 1.05017, 1.76588, 0.00548707, 0.00548707, 0, 0.843701, 1.54307, 1.23912, 1.78553e-12}}},
{{'F', "gemm", {"Q", "Q", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIqp"}, "av16+m32@32 av16x2 aB wg 4x8 cb4x2 ks32 xaf vav di hi pt sr br bk0 grf256 sys rr kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {32, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 65536, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.03762e+06, 706048, 0, 0, 6.93043e+06, 1.10019e+07, 0.892859, 1.0972, 0.98165, 1.70677, 0.00434444, 0.00434444, 0, 0.705466, 1.6217, 1.19447, 5.03184e-12}}},
{{'F', "gemm", {"Q", "Q", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@16 aB16+B16@16 aB wg 4x4 vav di hi pt sb256 bk0 grf256 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {16, 16, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.10018e+06, 251905, 0, 0, 0, 0, 1.56408, 2.85947, 0.648851, 1.37611, 0.0629702, 0.000146865, 0.0632313, 0.517444, 1.16754, -0.0884205, 2.14696e-11}}},
{{'F', "gemm", {"Q", "Q", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIpq"}, "at16x2+m64@48 am32+m32@64 aB wg 8x4 xaf vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {877896, 641311, 0, 0, 7.70867e+06,1.03055e+07, 0.79175, 0.746432, 0.882948, 1.4915, 0.00422556, 0.00422556, 0, 0.975411, 1.67727, 1.23964, 3.79962e-12}}},
{{'F', "gemm", {"Q", "Q", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIqp"}, "at16+m64@48 am32+m32@56 aB wg 4x8 xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 655360, 16777216}, {1048576, 655360, 32}, {16, 40, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {885722, 718272, 0, 0, 8.48691e+06, 1.26566e+07, 1.01026, 0.782981, 0.918959, 1.54496, 0.00414471, 0.00414471, 0, 1, 2.17926, 1.31409, 2.23082e-12}}},
{{'F', "gemm", {"Q", "Q", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, "AI"}, "at16+m32@48 aB64 aB wg 1x4x8 kr af vav di li pt sr br sb64 bk0 sm dm grf256 sys kv afb l4", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {1, 4, 8}, 1, (WGType) 1, 445, 0, 4096, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.15217e+06, -88485.5, -9852.59, 144015,2.69517e+06, 1.69656e+06, 1.08886, 0.414631, 0.587045, 1.19593, 0.0202405, 0.0220064, 0.0155006, 1, 1.0809, 0.644827, 3.81346e-12}}},
{{'F', "gemm", {"Q", "Q", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+S1,16@24 aS16+S32@16 aB wg 2x2x8 kr vav hi pt sr sb256 bk0 sm sn grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {8192, 8192, 0}, {16, 16, 16}, {2, 2, 8}, 1, (WGType) 1, 413, 0, 8192, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.2289e+06, -127728, -18531.3, 192240, 3.35053e+06, 0, 0.932716, 1.33521, 0.665104, 1.39923, 0.0628179, 0.0675437, 0.0114361, 0.999809, 1.27564, 0.821381, 3.76564e-11}}},
{{'F', "gemm", {"Q", "Q", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at32+m32@48 am16x2+m64@32 aB wg 4x4x2 kr xaf st vav di hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 524288, 16777216}, {524288, 524288, 32}, {16, 32, 32}, {4, 4, 2}, 1, (WGType) 1, 445, 0, 65536, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.03742e+06, -452509, 25423.9, 695882, 3.92397e+06, 3.94035e+06, 0.820319, 0.785135, 0.84902, 1.56923, 0.00809246, 0.00116078, 0.00745441, 0.526504, 1.60176, 1.07294, 4.15601e-12}}},
{{'F', "gemm", {"Q", "Q", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS8+S1,16@16 aB8+B8@16 aU vav di wg 8x4 bo pt sm sb256 grf256 bk0 sr", {16, (LoopType) 255, 256, {(LoopType) 192, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {8192, 8192, 16777216}, {16, 16, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {875898, 864492, 0, 0, 0, 0, 2.55527, 2.12966, 0.857629, 1.85569, 0.0625314, 0.0625314, 0, 1, 1.01096, 1.00724, 3.06523e-14}}},
{{'F', "gemm", {"S", "S", "S"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@8 aB8+B8@8 aB nse di wg 4x8 bo pt sb256 kc8 bk0 sr", {16, (LoopType) 255, 128, {(LoopType) 192, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {1024}}},
{{'F', "gemm", {"S", "S", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "AB"}, "aB16+m8@32 aS32+m16@40 aB wg 4x4 kc16 nse di hi pt sb256 bk0 sn grf256 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 32}, {4, 4, 1}, 1, (WGType) 1, 257, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.08792e+06, 260070, 0, 0, 0, 0, 1.27159, 2.25336, 0.633711, 1.35704, 0.0632943, 0.00105479, 0.0694168, 0.543903, 1.15915, 0.195161, 2.93818e-11}}},
Expand Down

0 comments on commit 068f850

Please sign in to comment.