包含以下内容:
- sgemv_k32_f32_kernel
- sgemv_k128_f32x4_kernel
- sgemv_k16_f32_kernel
- PyTorch bindings
# 只测试 Ada 架构；若不指定 TORCH_CUDA_ARCH_LIST，默认会编译所有架构（Volta、Ampere、Ada、Hopper 等），耗时较长
export TORCH_CUDA_ARCH_LIST=Ada
python3 sgemv.py
输出:
--------------------------------------------------------------------------------
out_k32f32: [-0.49123383, -13.83110714, -9.43372917], time:0.00372529ms
out_k128f32x4: [-0.49123383, -13.83110905, -9.43372917], time:0.00376225ms
out_f32_th: [-0.49123335, -13.83110809, -9.43372917], time:0.00836253ms
--------------------------------------------------------------------------------
out_k16f32: [-1.54411626, 9.39481068, 1.68226683], time:0.00364780ms
out_f32_th: [-1.54411626, 9.39480972, 1.68226647], time:0.00812173ms
--------------------------------------------------------------------------------