diff --git a/Dockerfile b/Dockerfile index 81618b20ae..3e6bd99554 100644 --- a/Dockerfile +++ b/Dockerfile @@ -78,7 +78,8 @@ RUN cget -p $PREFIX init --cxx /opt/rocm/llvm/bin/clang++ --std=c++14 -DAMDGPU_T # Install dependencies RUN cget -p $PREFIX install pfultz2/rocm-recipes # Install a newer version of cmake for libMLIRMIOpen -RUN cget -p $PREFIX install kitware/cmake@v3.13.4 +RUN cget -p $PREFIX install kitware/cmake@v3.15.1 + ADD min-requirements.txt /min-requirements.txt RUN CXXFLAGS='-isystem $PREFIX/include' cget -p $PREFIX install -f /min-requirements.txt RUN cget -p $PREFIX install danmar/cppcheck@dd05839a7e63ef04afd34711cb3e1e0ef742882f @@ -92,19 +93,15 @@ RUN pip install -r /doc-requirements.txt RUN if [ "$USE_TARGETID" = "ON" ] ; then export HIPCC_LINK_FLAGS_APPEND='-O3 -parallel-jobs=4' && export HIPCC_COMPILE_FLAGS_APPEND='-O3 -Wno-format-nonliteral -parallel-jobs=4' && rm /usr/bin/hipcc; fi # install last released miopentensile in default (master), install latest commits when MIOTENSILE_VER="latest" (develop) -RUN if [ "$USE_TARGETID" = "OFF" ] ; then echo "MIOpenTensile is not installed."; elif [ "$MIOTENSILE_VER" = "latest" ] ; then cget -p $PREFIX install ROCmSoftwarePlatform/MIOpenTensile@be26d30d3d7509a414134a45f4a6d49e5da250b8; else cget -p $PREFIX install ROCmSoftwarePlatform/MIOpenTensile@4bfe00a8de61d12862d9fa803b8ea9a981a50f97; fi +RUN if [ "$USE_TARGETID" = "OFF" ] ; then echo "MIOpenTensile is not installed."; elif [ "$MIOTENSILE_VER" = "latest" ] ; then cget -p $PREFIX install ROCmSoftwarePlatform/MIOpenTensile@4fda8d57c6b088333b0392ba0617b0d6eec5d5b7; else cget -p $PREFIX install ROCmSoftwarePlatform/MIOpenTensile@403fc13acb8518c3f82a79dc501b21ef1751e470; fi RUN cd ~ && \ - export MLIR_COMMIT=bbce2f3216e013efe59d7e9c021b4896f89176b0 && \ + export MLIR_COMMIT=44abc4783fe2f6b4415871f7c44aa52ab89bccab && \ wget https://github.com/ROCmSoftwarePlatform/llvm-project-mlir/archive/$MLIR_COMMIT.tar.gz && \ tar -xvzf $MLIR_COMMIT.tar.gz && \ rm -rf $MLIR_COMMIT.tar.gz && \ cd llvm-project-mlir-$MLIR_COMMIT && mkdir -p build && cd build && \ - $PREFIX/bin/cmake -G "Unix Makefiles" ../llvm \ - -DLLVM_ENABLE_PROJECTS="mlir;lld" \ - -DCMAKE_BUILD_TYPE=Release \ - -DBUILD_SHARED_LIBS=OFF \ - -DLLVM_BUILD_LLVM_DYLIB=OFF \ - -DLLVM_ENABLE_TERMINFO=OFF && \ + $PREFIX/bin/cmake .. -DCMAKE_BUILD_TYPE=Release -DBUILD_FAT_LIBMLIRMIOPEN=1 && \ make -j$(nproc) libMLIRMIOpen && \ + $PREFIX/bin/cmake --install . --component libMLIRMIOpen --prefix /opt/rocm && \ cd ~ && rm -rf llvm-project-mlir-$MLIR_COMMIT diff --git a/Jenkinsfile b/Jenkinsfile index 625e16b392..eec4aa77dc 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -183,7 +183,11 @@ pipeline { defaultValue: true, description: "") booleanParam( - name: "SMOKE_TESTS", + name: "SMOKE_FP32_AUX1", + defaultValue: true, + description: "") + booleanParam( + name: "SMOKE_FP16_BF16_INT8", defaultValue: true, description: "") booleanParam( @@ -280,7 +284,7 @@ pipeline { } } stage("Smoke Fp32"){ - when { expression { params.SMOKE_TESTS } } + when { expression { params.SMOKE_FP32_AUX1 } } parallel{ stage('Fp32 OpenCL Debug') { agent{ label rocmnode("vega") } @@ -341,7 +345,7 @@ pipeline { } } stage("Smoke Aux 1"){ - when { expression { params.SMOKE_TESTS } } + when { expression { params.SMOKE_FP32_AUX1 } } parallel{ stage('Fp32 HipNoGPU Debug') { agent{ label rocmnode("nogpu") } @@ -511,7 +515,7 @@ pipeline { } } stage("Smoke Fp16/Bf16/Int8"){ - when { expression { params.SMOKE_TESTS } } + when { expression { params.SMOKE_FP16_BF16_INT8 } } parallel{ stage('Fp16 Hip Vega20 /opt/rocm') { agent{ label rocmnode("vega20") } @@ -557,7 +561,7 @@ pipeline { agent{ label rocmnode("gfx908") } steps{ script{ - runDockerJob(flags: '-DMIOPEN_TEST_BFLOAT16=On -DMIOPEN_TEST_GFX908=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=debug', prefixpath: '/opt/rocm', gpu_arch: "gfx908") + runDockerJob(flags: '-DMIOPEN_TEST_HALF=On -DMIOPEN_TEST_GFX908=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=debug', prefixpath: '/opt/rocm', gpu_arch: "gfx908") } } } @@ -572,7 +576,7 @@ pipeline { cmd = """ ulimit -c unlimited cd build - CXX=/opt/rocm/llvm/bin/clang++ cmake -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_TEST_HALF=On -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF -DMIOPEN_TEST_FLAGS=--disable-verification-cache .. + CXX=/opt/rocm/llvm/bin/clang++ cmake -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_TEST_HALF=On -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF -DMIOPEN_TEST_FLAGS='--verbose --disable-verification-cache' .. MIOPEN_DEBUG_HIP_KERNELS=0 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check """ } @@ -588,7 +592,7 @@ pipeline { cmd = """ ulimit -c unlimited cd build - CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_INT8=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF .. + CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_INT8=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF -DMIOPEN_TEST_FLAGS='--verbose --disable-verification-cache' .. MIOPEN_DEBUG_HIP_KERNELS=0 MIOPEN_LOG_LEVEL=5 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check """ } @@ -620,7 +624,7 @@ pipeline { cmd = """ ulimit -c unlimited cd build - CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_BFLOAT16=On -DMIOPEN_TEST_GFX908=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF .. + CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_BFLOAT16=On -DMIOPEN_TEST_GFX908=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF -DMIOPEN_TEST_FLAGS='--verbose --disable-verification-cache' .. MIOPEN_DEBUG_HIP_KERNELS=0 MIOPEN_LOG_LEVEL=5 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check """ } @@ -760,7 +764,7 @@ pipeline { cmd = """ ulimit -c unlimited cd build - CXX=/opt/rocm/llvm/bin/clang++ cmake -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_TEST_ALL=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF -DMIOPEN_TEST_FLAGS=--disable-verification-cache .. + CXX=/opt/rocm/llvm/bin/clang++ cmake -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_TEST_ALL=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF -DMIOPEN_TEST_FLAGS='--verbose --disable-verification-cache' .. MIOPEN_DEBUG_HIP_KERNELS=0 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check """ } @@ -776,7 +780,7 @@ pipeline { cmd = """ ulimit -c unlimited cd build - CXX=/opt/rocm/llvm/bin/clang++ cmake -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_TEST_HALF=On -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_ALL=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF -DMIOPEN_TEST_FLAGS=--disable-verification-cache .. + CXX=/opt/rocm/llvm/bin/clang++ cmake -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_TEST_HALF=On -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_ALL=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF -DMIOPEN_TEST_FLAGS='--verbose --disable-verification-cache' .. MIOPEN_DEBUG_HIP_KERNELS=0 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check """ } @@ -792,7 +796,7 @@ pipeline { cmd = """ ulimit -c unlimited cd build - CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_BFLOAT16=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF .. + CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_BFLOAT16=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF -DMIOPEN_TEST_FLAGS='--verbose --disable-verification-cache' .. MIOPEN_DEBUG_HIP_KERNELS=0 MIOPEN_LOG_LEVEL=5 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check """ } @@ -808,7 +812,7 @@ pipeline { cmd = """ ulimit -c unlimited cd build - CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_INT8=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF .. + CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_INT8=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF -DMIOPEN_TEST_FLAGS='--verbose --disable-verification-cache' .. MIOPEN_DEBUG_HIP_KERNELS=0 MIOPEN_LOG_LEVEL=5 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check """ } @@ -840,7 +844,7 @@ pipeline { cmd = """ ulimit -c unlimited cd build - CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_HALF=On -DMIOPEN_TEST_GFX908=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF .. + CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_HALF=On -DMIOPEN_TEST_GFX908=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF -DMIOPEN_TEST_FLAGS='--verbose --disable-verification-cache' .. MIOPEN_DEBUG_HIP_KERNELS=0 MIOPEN_LOG_LEVEL=5 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check """ } @@ -856,7 +860,7 @@ pipeline { cmd = """ ulimit -c unlimited cd build - CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_BFLOAT16=On -DMIOPEN_TEST_GFX908=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF .. + CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_BFLOAT16=On -DMIOPEN_TEST_GFX908=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF -DMIOPEN_TEST_FLAGS='--verbose --disable-verification-cache' .. MIOPEN_DEBUG_HIP_KERNELS=0 MIOPEN_LOG_LEVEL=5 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check """ } @@ -872,7 +876,7 @@ pipeline { cmd = """ ulimit -c unlimited cd build - CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_INT8=On -DMIOPEN_TEST_GFX908=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF .. + CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_INT8=On -DMIOPEN_TEST_GFX908=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF -DMIOPEN_TEST_FLAGS='--verbose --disable-verification-cache' .. MIOPEN_DEBUG_HIP_KERNELS=0 MIOPEN_LOG_LEVEL=5 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check """ } @@ -893,7 +897,7 @@ pipeline { cmd = """ ulimit -c unlimited cd build - CXX=/opt/rocm/llvm/bin/clang++ cmake -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_TEST_ALL=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF -DMIOPEN_TEST_FLAGS=--disable-verification-cache .. + CXX=/opt/rocm/llvm/bin/clang++ cmake -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_TEST_ALL=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF -DMIOPEN_TEST_FLAGS='--verbose --disable-verification-cache' .. MIOPEN_DEBUG_HIP_KERNELS=0 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check """ } @@ -909,7 +913,7 @@ pipeline { cmd = """ ulimit -c unlimited cd build - CXX=/opt/rocm/llvm/bin/clang++ cmake -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_TEST_HALF=On -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_ALL=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF -DMIOPEN_TEST_FLAGS=--disable-verification-cache .. + CXX=/opt/rocm/llvm/bin/clang++ cmake -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_TEST_HALF=On -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_ALL=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF -DMIOPEN_TEST_FLAGS='--verbose --disable-verification-cache' .. MIOPEN_DEBUG_HIP_KERNELS=0 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check """ } @@ -925,7 +929,7 @@ pipeline { cmd = """ ulimit -c unlimited cd build - CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_BFLOAT16=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF .. + CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_BFLOAT16=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF -DMIOPEN_TEST_FLAGS='--verbose --disable-verification-cache' .. MIOPEN_DEBUG_HIP_KERNELS=0 MIOPEN_LOG_LEVEL=5 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check """ } @@ -941,7 +945,7 @@ pipeline { cmd = """ ulimit -c unlimited cd build - CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_INT8=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF .. + CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_INT8=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF -DMIOPEN_TEST_FLAGS='--verbose --disable-verification-cache' .. MIOPEN_DEBUG_HIP_KERNELS=0 MIOPEN_LOG_LEVEL=5 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check """ } @@ -973,7 +977,7 @@ pipeline { cmd = """ ulimit -c unlimited cd build - CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_HALF=On -DMIOPEN_TEST_GFX908=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF .. + CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_HALF=On -DMIOPEN_TEST_GFX908=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF -DMIOPEN_TEST_FLAGS='--verbose --disable-verification-cache' .. MIOPEN_DEBUG_HIP_KERNELS=0 MIOPEN_LOG_LEVEL=5 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check """ } @@ -989,7 +993,7 @@ pipeline { cmd = """ ulimit -c unlimited cd build - CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_BFLOAT16=On -DMIOPEN_TEST_GFX908=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF .. + CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_BFLOAT16=On -DMIOPEN_TEST_GFX908=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF -DMIOPEN_TEST_FLAGS='--verbose --disable-verification-cache' .. MIOPEN_DEBUG_HIP_KERNELS=0 MIOPEN_LOG_LEVEL=5 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check """ } @@ -1005,7 +1009,7 @@ pipeline { cmd = """ ulimit -c unlimited cd build - CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_INT8=On -DMIOPEN_TEST_GFX908=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF .. + CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_INT8=On -DMIOPEN_TEST_GFX908=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF -DMIOPEN_TEST_FLAGS='--verbose --disable-verification-cache' .. MIOPEN_DEBUG_HIP_KERNELS=0 MIOPEN_LOG_LEVEL=5 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check """ } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3842e76ae0..7a3345a808 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -297,6 +297,8 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/conv_3x3_wheel_alpha_v9_0_15_gfx9_stride_2_dil.inc kernels/conv_3x3_wheel_alpha_v9_0_15_gfx9_stride_2_dec.inc kernels/conv_3x3_wheel_alpha_v9_0_15_gfx9.inc + kernels/Conv_Winograd_v21_1_2_gfx9_f3x2_fp32_stride1_group.inc + kernels/Conv_Winograd_v21_1_2_gfx10_f3x2_fp32_stride1_group.inc kernels/Conv_Winograd_v21_1_2_gfx9_fp16_dot2_edc_dilation2.inc kernels/Conv_Winograd_v21_1_2_gfx9_fp16_dot2_edc_stride1.inc kernels/Conv_Winograd_v21_1_2_gfx9_fp16_dot2_edc_stride2.inc @@ -423,6 +425,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/conv_3x3_wheel_alpha_v3_0b.s kernels/conv_3x3_wheel_alpha_v9_2_7.s kernels/conv_3x3_wheel_alpha_v9_2_7_stride_2_dec.s + kernels/Conv_Winograd_v21_1_2_f3x2_fp32_stride1_group.s kernels/Conv_Winograd_v21_1_2_fp16_dot2_edc_dilation2.s kernels/Conv_Winograd_v21_1_2_fp16_dot2_edc_stride1.s kernels/Conv_Winograd_v21_1_2_fp16_dot2_edc_stride2.s diff --git a/src/conv/invokers/mlir_impl_gemm.cpp b/src/conv/invokers/mlir_impl_gemm.cpp index e2bf313ab3..4179027089 100644 --- a/src/conv/invokers/mlir_impl_gemm.cpp +++ b/src/conv/invokers/mlir_impl_gemm.cpp @@ -101,27 +101,37 @@ void ComputeMlirDimsStrides(const conv::ProblemDescription& conv_problem, { auto group_count = conv_problem.GetGroupCount(); + TensorDescriptor in; + if(conv_problem.GetDirection() == conv::Direction::Forward) + in = conv_problem.GetIn(); + else + in = conv_problem.GetOut(); + + in_dims = in.GetLengths(); + in_strides = in.GetStrides(); + PermuteDimsStrides(in_dims, in_strides); // Add a virtual group dimension before input channel. - const TensorDescriptor& in = conv_problem.GetIn(); - in_dims = in.GetLengths(); - in_strides = in.GetStrides(); InsertGToDimsStrides(in.GetLayout("NCHW"), 'C', group_count, in_dims, in_strides); - PermuteDimsStrides(in_dims, in_strides); // Add a virtual group dimension before output channel. const TensorDescriptor& weights = conv_problem.GetWeights(); weights_dims = weights.GetLengths(); weights_strides = weights.GetStrides(); + PermuteDimsStrides(weights_dims, weights_strides); InsertGToDimsStrides( weights.GetLayout("NCHW"), 'N', group_count, weights_dims, weights_strides); - PermuteDimsStrides(weights_dims, weights_strides); + TensorDescriptor out; + if(conv_problem.GetDirection() == conv::Direction::Forward) + out = conv_problem.GetOut(); + else + out = conv_problem.GetIn(); + + out_dims = out.GetLengths(); + out_strides = out.GetStrides(); + PermuteDimsStrides(out_dims, out_strides); // Add a virtual group dimension before output channel. - const TensorDescriptor& out = conv_problem.GetOut(); - out_dims = out.GetLengths(); - out_strides = out.GetStrides(); InsertGToDimsStrides(out.GetLayout("NCHW"), 'C', group_count, out_dims, out_strides); - PermuteDimsStrides(out_dims, out_strides); } MlirConvArgs MakeMlirConvArgs(const std::vector& in_dims, @@ -138,11 +148,11 @@ MlirConvArgs MakeMlirConvArgs(const std::vector& in_dims, std::copy(strides.cbegin(), strides.cend(), &target.strides[0]); }; - StridedMemRef5D filter{nullptr, nullptr, 0, {0, 0, 0, 0}, {0, 0, 0, 0}}; + StridedMemRef5D filter{nullptr, nullptr, 0, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}}; initDimStrides(weights_dims, weights_strides, filter); - StridedMemRef5D input{nullptr, nullptr, 0, {0, 0, 0, 0}, {0, 0, 0, 0}}; + StridedMemRef5D input{nullptr, nullptr, 0, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}}; initDimStrides(in_dims, in_strides, input); - StridedMemRef5D output{nullptr, nullptr, 0, {0, 0, 0, 0}, {0, 0, 0, 0}}; + StridedMemRef5D output{nullptr, nullptr, 0, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}}; initDimStrides(out_dims, out_strides, output); return {filter, input, output}; @@ -255,7 +265,7 @@ InvokerFactory MakeMlirBwdInvokerFactory(const ConvolutionContext& ctx) elapsed += handle.GetKernelTime(); } - SetMlirConvArgsPtr(tensors.in, tensors.out, tensors.w, args); + SetMlirConvArgsPtr(tensors.out, tensors.in, tensors.w, args); for(const auto& k : kernels) { handle.Run(k)(args); diff --git a/src/include/miopen/solver.hpp b/src/include/miopen/solver.hpp index fe94ecb257..a9e1a3c5fe 100644 --- a/src/include/miopen/solver.hpp +++ b/src/include/miopen/solver.hpp @@ -1451,11 +1451,47 @@ struct ConvBinWinogradRxS : SolverBase ConvSolution GetSolution(const ConvolutionContext& params) const; }; +struct PerformanceConfigConvBinWinogradRxSf3x2 + : Serializable +{ + int n_groups; + PerformanceConfigConvBinWinogradRxSf3x2(int n_groups_); + PerformanceConfigConvBinWinogradRxSf3x2() : PerformanceConfigConvBinWinogradRxSf3x2(-1) {} + PerformanceConfigConvBinWinogradRxSf3x2(bool) : PerformanceConfigConvBinWinogradRxSf3x2(1) {} + + template + static void Visit(Self&& self, F f) + { + f(self.n_groups, "n_groups"); + } + int GetNGroups() const { return n_groups; } + + void HeuristicInit(const ConvolutionContext& config); + bool IsValidValue() const; + bool SetNextValue(); + bool IsValid(const ConvolutionContext& config) const; + bool operator==(const PerformanceConfigConvBinWinogradRxSf3x2& other) const; + std::string ToString() const; +}; + struct ConvBinWinogradRxSf3x2 : SolverBase { + PerformanceConfigConvBinWinogradRxSf3x2 GetPerformanceConfig(const ConvolutionContext&) const; + bool IsValidPerformanceConfig(const ConvolutionContext&, + const PerformanceConfigConvBinWinogradRxSf3x2&) const; + PerformanceConfigConvBinWinogradRxSf3x2 Search(const ConvolutionContext&, + const AnyInvokeParams& invoke_ctx) const; + bool IsApplicable(const ConvolutionContext& params) const; bool IsDynamic() const { return true; } - ConvSolution GetSolution(const ConvolutionContext& params) const; + ConvSolution GetSolution(const ConvolutionContext& params, + const PerformanceConfigConvBinWinogradRxSf3x2& config, + bool disableConfigOverrideFromEnv = false) const; + static size_t GetNGroups(const size_t group_conv, const size_t grid_group_size) + { + assert(group_conv != 0); + return grid_group_size / group_conv; + } }; struct PerformanceConfigConvBinWinogradRxSf2x3 diff --git a/src/kernels/Conv_Winograd_v21_1_2_f3x2_fp32_stride1_group.s b/src/kernels/Conv_Winograd_v21_1_2_f3x2_fp32_stride1_group.s new file mode 100644 index 0000000000..7b4ef0fc52 --- /dev/null +++ b/src/kernels/Conv_Winograd_v21_1_2_f3x2_fp32_stride1_group.s @@ -0,0 +1,40 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +.include "Conv_Winograd_v21_1_2_metadata.inc" + +KERNEL_PROLOG f3x2_fp32_stride1_group + +.if (.amdgcn.gfx_generation_number == 9) + .if (.amdgcn.gfx_generation_stepping == 10) + .error "gfx90a is not supported yet" + .else + .include "Conv_Winograd_v21_1_2_gfx9_f3x2_fp32_stride1_group.inc" + .endif +.elseif (.amdgcn.gfx_generation_number == 10) + .include "Conv_Winograd_v21_1_2_gfx10_f3x2_fp32_stride1_group.inc" +.endif + +KERNEL_EPILOG f3x2_fp32_stride1_group diff --git a/src/kernels/Conv_Winograd_v21_1_2_gfx10_f3x2_fp32_stride1_group.inc b/src/kernels/Conv_Winograd_v21_1_2_gfx10_f3x2_fp32_stride1_group.inc new file mode 100644 index 0000000000..f380b43f78 --- /dev/null +++ b/src/kernels/Conv_Winograd_v21_1_2_gfx10_f3x2_fp32_stride1_group.inc @@ -0,0 +1,4567 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +s_version 0x2004 +s_inst_prefetch 0x3 +v_mov_b32_e32 v1, v0 +s_mov_b32 s0, 0 +s_mov_b32 s1, 0 +s_mov_b32 s2, 0 +s_mov_b32 s3, 0 +v_mov_b32_e32 v187, 0 +s_mov_b32 m0, 0x1ffff +s_mov_b32 s97, 0xc1e0 +s_mov_b32 s96, 0xc1e0 +v_and_b32_e32 v189, 0xc0, v0 +v_add_co_u32_e64 v1, vcc, v0, v189 +s_mov_b32 s91, 0 +v_lshlrev_b32_e32 v190, 2, v1 +v_add_co_u32_e64 v190, vcc, 0xffc0, v190 +v_cmp_ge_u32_e32 vcc, 12, v1 +s_cbranch_vccz 5 +v_mov_b32_e32 v189, 0 +v_cndmask_b32_e32 v190, -1, v190, vcc +ds_write_b32 v190, v189 +s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +s_barrier +v_readfirstlane_b32 s52, v1 +s_lshr_b32 s52, s52, 5 +s_add_u32 s52, s52, 8 +s_and_b32 s92, s52, 20 +s_mov_b64 s[40:41], s[6:7] +s_load_dwordx16 s[12:27], s[40:41], 0x0 +s_load_dwordx4 s[28:31], s[40:41], 0x40 +s_load_dwordx2 s[32:33], s[40:41], 0x50 +s_waitcnt lgkmcnt(0) +s_and_b32 s18, s18, 0xffff +s_bitcmp1_b32 s18, 6 +s_cbranch_scc0 16 +s_and_b32 s21, s21, 0xffff +s_and_b32 s23, s23, 0xffff +s_and_b32 s25, s25, 0xffff +s_and_b32 s27, s27, 0xffff +s_load_dwordx2 s[20:21], s[20:21], 0x0 +s_load_dwordx2 s[22:23], s[22:23], 0x0 +s_load_dwordx2 s[24:25], s[24:25], 0x0 +s_load_dwordx2 s[26:27], s[26:27], 0x0 +s_bitcmp1_b32 s18, 7 +s_cbranch_scc0 2 +s_load_dwordx2 s[34:35], s[40:41], 0x58 +s_mov_b32 s36, 1.0 +s_bitcmp1_b32 s18, 8 +s_cbranch_scc0 2 +s_load_dword s36, s[40:41], 0x60 +s_bitcmp1_b32 s18, 7 +s_cbranch_scc0 7 +s_bitcmp1_b32 s18, 6 +s_cbranch_scc0 5 +s_waitcnt lgkmcnt(0) +s_and_b32 s35, s35, 0xffff +s_load_dwordx2 s[34:35], s[34:35], 0x0 +s_bitcmp1_b32 s18, 9 +s_cbranch_scc0 72 +s_mov_b32 s42, 0x8c +s_mov_b32 s43, 0x9c +v_cmp_le_u32_e32 vcc, 0x100, v1 +s_cmp_eq_u32 1, src_vccz +s_cselect_b32 s42, s43, s42 +s_load_dword s65, s[40:41], 0x88 +s_load_dword s90, s[40:41], 0x98 +s_load_dword s68, s[40:41], s42 +s_load_dwordx2 s[66:67], s[40:41], 0xa8 +s_bitcmp1_b32 s18, 10 +s_cbranch_scc0 103 +s_load_dwordx4 s[44:47], s[40:41], 0xb8 +v_ffbh_u32_e32 v4, s17 +v_lshlrev_b32_e64 v5, v4, s17 +v_and_b32_e32 v6, 0xffffff00, v5 +v_cmp_eq_u32_e32 vcc, 0x80000000, v5 +v_cvt_f32_u32_e32 v6, v6 +v_rcp_f32_e32 v2, v6 +v_sub_co_ci_u32_e32 v3, vcc, 32, v4, vcc +v_cvt_f32_ubyte0_e32 v4, v5 +v_fma_f32 v6, v6, v2, -1.0 +v_fma_f32 v6, v4, v2, v6 +v_fmaak_f32 v6, v6, v2, 0x9f000000 +v_mul_f32_e32 v6, 0x5f800000, v6 +v_mov_b32_e32 v4, 0 +v_cvt_flr_i32_f32_e64 v6, -v6 +v_lshl_add_u32 v2, v2, 9, v6 +v_mad_u64_u32 v[4:5], vcc, v5, v2, v[4:5] +v_sub_co_ci_u32_e64 v2, vcc, v2, -1, vcc +v_mul_hi_u32 v4, s8, v2 +v_add_co_u32_e64 v2, vcc, v4, s8 +v_add_co_ci_u32_e64 v4, vcc, 0, 0, vcc +v_cmp_eq_u32_e32 vcc, 32, v3 +v_cndmask_b32_e32 v2, v2, v4, vcc +v_alignbit_b32 v2, v4, v2, v3 +s_waitcnt lgkmcnt(0) +v_readfirstlane_b32 s48, v2 +s_mul_i32 s49, s48, s17 +s_sub_u32 s8, s8, s49 +s_mul_i32 s49, s45, s48 +s_add_u32 s20, s20, s49 +s_addc_u32 s21, s21, 0 +s_mul_i32 s49, s46, s48 +s_add_u32 s22, s22, s49 +s_addc_u32 s23, s23, 0 +s_mul_i32 s49, s47, s48 +s_add_u32 s24, s24, s49 +s_addc_u32 s25, s25, 0 +s_branch 49 +s_mul_i32 s42, s14, s15 +s_lshr_b32 s46, -1, 16 +s_and_b32 s46, s46, s42 +s_lshr_b32 s47, s42, 16 +s_mul_i32 s47, s47, s13 +s_mul_i32 s44, s46, s13 +s_lshl_b32 s46, s47, 16 +s_lshr_b32 s47, s47, 16 +s_add_u32 s44, s46, s44 +s_addc_u32 s45, s47, 0 +s_lshl_b32 s65, s44, 2 +s_lshl_b32 s68, s42, 2 +s_mul_i32 s43, s32, s33 +s_lshr_b32 s46, -1, 16 +s_and_b32 s46, s46, s43 +s_lshr_b32 s47, s43, 16 +s_mul_i32 s47, s47, s16 +s_mul_i32 s44, s46, s16 +s_lshl_b32 s46, s47, 16 +s_lshr_b32 s47, s47, 16 +s_add_u32 s44, s46, s44 +s_addc_u32 s45, s47, 0 +s_lshl_b32 s66, s44, 2 +s_lshl_b32 s67, s43, 2 +s_bitcmp1_b32 s18, 13 +s_cbranch_scc0 2 +s_load_dwordx8 s[48:55], s[40:41], 0x68 +s_mul_i32 s42, s28, s29 +s_lshl_b32 s42, s42, 2 +s_bitcmp1_b32 s18, 2 +s_cselect_b32 s43, s16, s13 +s_lshr_b32 s44, -1, 16 +s_and_b32 s44, s44, s42 +s_lshr_b32 s45, s42, 16 +s_mul_i32 s45, s45, s43 +s_mul_i32 s56, s44, s43 +s_lshl_b32 s44, s45, 16 +s_lshr_b32 s45, s45, 16 +s_add_u32 s56, s44, s56 +s_addc_u32 s57, s45, 0 +s_mov_b32 s43, s56 +s_bitcmp1_b32 s18, 2 +s_cselect_b32 s44, s43, s42 +s_cselect_b32 s90, s42, s43 +v_cmp_le_u32_e32 vcc, 0x100, v1 +s_cmp_eq_u32 1, src_vccz +s_cselect_b32 s68, s44, s68 +s_waitcnt lgkmcnt(0) +s_and_b32 s21, s21, 0xffff +s_and_b32 s23, s23, 0xffff +s_and_b32 s25, s25, 0xffff +s_and_b32 s27, s27, 0xffff +s_and_b32 s35, s35, 0xffff +s_bitcmp1_b32 s18, 13 +s_cbranch_scc0 8 +s_add_u32 s20, s20, s48 +s_addc_u32 s21, s21, s49 +s_add_u32 s22, s22, s50 +s_addc_u32 s23, s23, s51 +s_add_u32 s24, s24, s52 +s_addc_u32 s25, s25, s53 +s_add_u32 s34, s34, s54 +s_addc_u32 s35, s35, s55 +s_and_b32 s44, 0, s30 +s_addc_u32 s44, s32, 0 +s_ashr_i32 s44, s44, 0 +s_add_u32 s42, s44, 2 +v_mov_b32_e32 v2, 0x55555556 +v_mul_hi_u32 v2, v2, s42 +v_readfirstlane_b32 s42, v2 +s_andn2_b32 s44, 0, s31 +s_addc_u32 s44, s33, 0 +s_ashr_i32 s44, s44, 0 +s_add_u32 s43, s44, 2 +v_mov_b32_e32 v2, 0x55555556 +v_mul_hi_u32 v2, v2, s43 +v_readfirstlane_b32 s43, v2 +s_sub_u32 s75, 0, s43 +s_sub_u32 s74, 0, s42 +s_add_u32 s60, s28, 1 +v_mov_b32_e32 v2, 0x80000000 +v_mul_hi_u32 v2, v2, s60 +v_readfirstlane_b32 s60, v2 +s_add_u32 s61, s29, 1 +v_mov_b32_e32 v2, 0x80000000 +v_mul_hi_u32 v2, v2, s61 +v_readfirstlane_b32 s61, v2 +v_mad_i32_i24 v2, 2, s60, -1 +v_sub_co_u32_e64 v2, vcc, v2, s28 +v_add_co_ci_u32_e64 v2, vcc, 0, 0, vcc +v_readfirstlane_b32 s44, v2 +s_and_b32 s44, s44, 0 +s_and_b32 s44, s44, s60 +s_add_u32 s60, s60, s44 +v_readfirstlane_b32 s45, v1 +s_and_b32 s48, s45, 64 +s_cselect_b32 s48, 0x80000, 0 +s_or_b32 s18, s18, s48 +s_lshl_b32 s69, s68, 1 +s_mov_b64 s[70:71], 0 +s_bitcmp1_b32 s18, 12 +s_cselect_b32 s44, 0, -1 +s_bitcmp1_b32 s18, 11 +s_cselect_b32 s44, s44, 1 +s_cmp_gt_u32 s61, s44 +s_cbranch_scc0 8 +s_bitset1_b32 s18, 23 +s_bitset1_b32 s18, 20 +s_bitset0_b32 s18, 19 +s_ashr_i32 s69, s69, 1 +s_ashr_i64 s[70:71], s[70:71], 1 +s_add_u32 s61, s61, 1 +s_and_b32 s61, s61, -2 +s_branch 16 +s_and_b32 s48, s13, 1 +s_cselect_b32 s48, 0, 0x1000000 +s_bitcmp1_b32 s18, 2 +s_cselect_b32 s48, 0, s48 +s_or_b32 s18, s18, s48 +s_cmp_eq_u32 s48, 0 +s_cselect_b32 s69, s68, s69 +s_cselect_b32 s70, s68, s70 +s_cselect_b32 s71, 0, s71 +s_bitcmp0_b32 s45, 8 +s_cselect_b32 s48, s48, 0 +s_cmp_eq_u32 s48, 0 +s_cselect_b32 s48, 0, 0x80000 +s_andn2_b32 s18, s18, s48 +v_bfe_u32 v3, v1, 2, 6 +v_lshrrev_b32_e32 v182, 1, v3 +s_bitcmp0_b32 s45, 8 +s_cselect_b32 s48, 0x1000000, 0 +s_or_b32 s48, s48, 0x100000 +s_and_b32 s48, s18, s48 +s_cselect_b32 s48, 0, 15 +v_bfi_b32 v182, s48, v3, v182 +s_mul_i32 s88, s12, s42 +s_sub_u32 s88, s88, 1 +s_lshr_b32 s88, s88, 0 +s_add_u32 s88, s88, 1 +s_lshr_b32 s46, -1, 16 +s_and_b32 s46, s46, s88 +s_lshr_b32 s47, s88, 16 +s_mul_i32 s47, s47, s43 +s_mul_i32 s88, s46, s43 +s_lshl_b32 s46, s47, 16 +s_lshr_b32 s47, s47, 16 +s_add_u32 s88, s46, s88 +s_addc_u32 s89, s47, 0 +s_sub_u32 s88, s88, 1 +s_subb_u32 s89, s89, 0 +s_lshr_b64 s[88:89], s[88:89], 5 +s_add_u32 s88, s88, 1 +s_addc_u32 s89, s89, 0 +v_mov_b32_e32 v4, s8 +v_mov_b32_e32 v5, s17 +v_and_b32_e32 v6, 3, v1 +v_cmp_eq_u32_e32 vcc, 2, v6 +v_cndmask_b32_e32 v4, v4, v5, vcc +v_cmp_eq_u32_e32 vcc, 1, v6 +v_cndmask_b32_e32 v7, 0, v182, vcc +s_bitcmp1_b32 s18, 20 +s_cbranch_scc0 4 +v_add_co_u32_e64 v5, vcc, v182, 8 +v_cmp_eq_u32_e32 vcc, 0, v6 +v_cndmask_b32_e32 v7, v7, v5, vcc +v_cmp_eq_u32_e64 s[46:47], 3, v6 +v_bfe_u32 v180, v7, 0, 5 +v_mad_u32_u24 v180, v4, 32, v180 +v_ffbh_u32_e32 v9, s43 +v_lshlrev_b32_e64 v10, v9, s43 +v_and_b32_e32 v11, 0xffffff00, v10 +v_cmp_eq_u32_e32 vcc, 0x80000000, v10 +v_cvt_f32_u32_e32 v11, v11 +v_rcp_f32_e32 v181, v11 +v_sub_co_ci_u32_e32 v8, vcc, 32, v9, vcc +v_cvt_f32_ubyte0_e32 v9, v10 +v_fma_f32 v11, v11, v181, -1.0 +v_fma_f32 v11, v9, v181, v11 +v_fmaak_f32 v11, v11, v181, 0x9f000000 +v_mul_f32_e32 v11, 0x5f800000, v11 +v_mov_b32_e32 v9, 0 +v_cvt_flr_i32_f32_e64 v11, -v11 +v_lshl_add_u32 v181, v181, 9, v11 +v_mad_u64_u32 v[9:10], vcc, v10, v181, v[9:10] +v_sub_co_ci_u32_e64 v181, vcc, v181, -1, vcc +v_mul_hi_u32 v9, v180, v181 +v_add_co_u32_e64 v181, vcc, v9, v180 +v_add_co_ci_u32_e64 v9, vcc, 0, 0, vcc +v_cmp_eq_u32_e32 vcc, 32, v8 +v_cndmask_b32_e32 v181, v181, v9, vcc +v_alignbit_b32 v181, v9, v181, v8 +v_mad_i32_i24 v179, v181, s75, v180 +v_lshrrev_b32_e32 v180, 5, v7 +v_mad_u32_u24 v180, v181, 1, v180 +v_cndmask_b32_e64 v180, v180, 1, s[46:47] +v_ffbh_u32_e32 v9, s42 +v_lshlrev_b32_e64 v10, v9, s42 +v_and_b32_e32 v11, 0xffffff00, v10 +v_cmp_eq_u32_e32 vcc, 0x80000000, v10 +v_cvt_f32_u32_e32 v11, v11 +v_rcp_f32_e32 v181, v11 +v_sub_co_ci_u32_e32 v8, vcc, 32, v9, vcc +v_cvt_f32_ubyte0_e32 v9, v10 +v_fma_f32 v11, v11, v181, -1.0 +v_fma_f32 v11, v9, v181, v11 +v_fmaak_f32 v11, v11, v181, 0x9f000000 +v_mul_f32_e32 v11, 0x5f800000, v11 +v_mov_b32_e32 v9, 0 +v_cvt_flr_i32_f32_e64 v11, -v11 +v_lshl_add_u32 v181, v181, 9, v11 +v_mad_u64_u32 v[9:10], vcc, v10, v181, v[9:10] +v_sub_co_ci_u32_e64 v181, vcc, v181, -1, vcc +v_mul_hi_u32 v9, v180, v181 +v_add_co_u32_e64 v181, vcc, v9, v180 +v_add_co_ci_u32_e64 v9, vcc, 0, 0, vcc +v_cmp_eq_u32_e32 vcc, 32, v8 +v_cndmask_b32_e32 v181, v181, v9, vcc +v_alignbit_b32 v181, v9, v181, v8 +v_mad_i32_i24 v180, v181, s74, v180 +v_readlane_b32 s76, v179, 2 +v_readlane_b32 s77, v180, 2 +v_readlane_b32 s78, v181, 2 +v_readlane_b32 s79, v180, 3 +v_readlane_b32 s80, v181, 3 +v_add_co_u32_e64 v179, vcc, v179, s75 +v_add_co_u32_e64 v180, vcc, v180, s74 +v_mov_b32_dpp v181, v181 quad_perm:[1,1,0,0] row_mask:0xf bank_mask:0xf +v_mov_b32_dpp v179, v179 quad_perm:[1,1,0,0] row_mask:0xf bank_mask:0xf +v_mov_b32_dpp v180, v180 quad_perm:[1,1,0,0] row_mask:0xf bank_mask:0xf +s_mov_b32 s42, 0x80000000 +s_mov_b32 s43, 0x31014000 +s_mov_b32 s46, 0x80000000 +s_mov_b32 s47, 0x31014000 +v_cmp_le_u32_e32 vcc, 0x100, v1 +s_cbranch_vccnz 6 +v_xor_b32_dpp v183, v1, v1 quad_perm:[2,3,2,1] row_mask:0xf bank_mask:0xf +v_subrev_co_u32_e64 v183, vcc, 1, v183 +v_cvt_f32_i32_e32 v183, v183 +s_branch 5 +v_xor_b32_dpp v183, v1, v1 quad_perm:[2,1,0,1] row_mask:0xf bank_mask:0xf +v_sub_co_u32_e64 v183, vcc, 1, v183 +v_cvt_f32_i32_e32 v183, v183 +v_mov_b32_e32 v184, 1 +v_xor_b32_dpp v184, v1, v1 quad_perm:[2,3,2,3] row_mask:0xf bank_mask:0x4 +v_xor_b32_dpp v184, v1, v1 quad_perm:[0,1,0,1] row_mask:0xf bank_mask:0x8 +v_subrev_co_u32_e64 v184, vcc, 1, v184 +v_mov_b32_e32 v185, 1 +v_xor_b32_dpp v185, v1, v1 quad_perm:[0,3,2,1] row_mask:0xf bank_mask:0x2 +v_xor_b32_dpp v185, v1, v1 quad_perm:[2,1,0,3] row_mask:0xf bank_mask:0x4 +v_subrev_co_u32_e64 v185, vcc, 1, v185 +v_cvt_f32_i32_e32 v184, v184 +v_cvt_f32_i32_e32 v185, v185 +v_lshrrev_b32_e64 v189, 2, s92 +v_and_b32_e32 v190, 3, v1 +v_bfe_u32 v191, v1, 4, 3 +v_mad_u32_u24 v175, v191, 4, v190 +v_lshlrev_b32_e32 v175, 4, v175 +v_mad_u32_u24 v165, v189, 4, v190 +v_lshlrev_b32_e32 v165, 4, v165 +v_bfe_u32 v189, v1, 2, 2 +v_and_b32_e32 v190, 1, v189 +v_mad_u32_u24 v192, v189, 16, v190 +v_lshlrev_b32_e32 v192, 6, v192 +v_xor_b32_e32 v165, v165, v192 +v_mul_u32_u24_e32 v192, 0x400, v189 +v_xor_b32_e32 v175, v175, v192 +s_lshr_b32 s92, s92, 0 +v_cmp_le_u32_e32 vcc, 0x100, v1 +s_cbranch_vccnz 61 +s_and_b32 s53, s18, 0x1100000 +s_addc_u32 s53, 0, 0 +v_lshrrev_b32_e32 v192, 1, v1 +s_mul_i32 s52, 60, s53 +s_sub_u32 s52, 63, s52 +v_bfi_b32 v192, s52, v1, v192 +v_and_b32_e32 v189, 1, v192 +v_bfe_u32 v190, v192, 1, 1 +v_xor_b32_e32 v189, v189, v190 +v_bfe_u32 v191, v192, 3, 1 +v_mad_u32_u24 v190, v190, 2, v191 +v_mul_u32_u24_e32 v189, 0x118, v189 +v_bfe_u32 v191, v192, 2, 1 +v_mad_u32_u24 v190, v190, 2, v189 +v_xor_b32_e32 v190, v190, v191 +v_and_b32_e32 v191, 0xf0, v192 +v_xor_b32_e32 v190, v190, v191 +s_mul_i32 s52, 4, s53 +s_sub_u32 s52, 6, s52 +v_bfe_u32 v192, v1, s52, 1 +v_mul_u32_u24_e32 v192, 0x1040, v192 +v_xor_b32_e32 v168, 0x314, v190 +v_xor_b32_e32 v169, 0x31c, v190 +v_xor_b32_e32 v170, 8, v190 +v_mov_b32_e32 v167, v190 +v_mad_u32_u24 v167, 4, v167, v192 +v_mad_u32_u24 v168, 4, v168, v192 +v_mad_u32_u24 v169, 4, v169, v192 +v_mad_u32_u24 v170, 4, v170, v192 +s_mov_b32 s52, 0x1040 +s_and_b32 s53, s18, 0x1100000 +s_cselect_b32 s52, 0x80, s52 +v_add_co_u32_e64 v171, vcc, v167, s52 +v_add_co_u32_e64 v172, vcc, v168, s52 +v_add_co_u32_e64 v173, vcc, v169, s52 +v_add_co_u32_e64 v174, vcc, v170, s52 +s_branch 57 +s_bfe_u32 s53, s18, 0x10014 +v_lshrrev_b32_e32 v192, 1, v1 +s_mul_i32 s52, 60, s53 +s_sub_u32 s52, 63, s52 +v_bfi_b32 v192, s52, v1, v192 +v_and_b32_e32 v189, 1, v192 +v_bfe_u32 v190, v192, 1, 1 +v_bfe_u32 v191, v192, 3, 1 +v_xor_b32_e32 v189, v189, v190 +v_mad_u32_u24 v190, v190, 2, v191 +v_mul_u32_u24_e32 v189, 0x109, v189 +v_bfe_u32 v191, v192, 2, 1 +v_mad_u32_u24 v190, v190, 2, v189 +v_xor_b32_e32 v190, v190, v191 +v_and_b32_e32 v191, 0xf0, v192 +v_or_b32_e32 v190, v190, v191 +s_mul_i32 s52, 4, s53 +s_sub_u32 s52, 6, s52 +v_bfe_u32 v192, v1, s52, 1 +v_mul_u32_u24_e32 v192, 0x1040, v192 +v_mad_u32_u24 v167, 4, v190, v192 +v_xor_b32_e32 v168, 0x307, v190 +v_mad_u32_u24 v168, 4, v168, v192 +v_xor_b32_e32 v169, 0x30f, v190 +v_mad_u32_u24 v169, 4, v169, v192 +v_xor_b32_e32 v170, 8, v190 +v_mad_u32_u24 v170, 4, v170, v192 +s_mov_b32 s52, 0x1040 +s_bitcmp1_b32 s18, 20 +s_cselect_b32 s52, 0x80, s52 +v_add_co_u32_e64 v171, vcc, v167, s52 +v_add_co_u32_e64 v172, vcc, v168, s52 +v_add_co_u32_e64 v173, vcc, v169, s52 +v_add_co_u32_e64 v174, vcc, v170, s52 +v_subrev_co_u32_e64 v179, vcc, s76, v179 +v_mov_b32_e32 v190, s75 +v_cmp_lt_i32_e32 vcc, v179, v190 +v_sub_co_ci_u32_e64 v189, vcc, 0, 0, vcc +v_mad_i32_i24 v179, v189, s75, v179 +v_mad_i32_i24 v181, v189, s80, v181 +v_mad_i32_i24 v180, v189, s79, v180 +v_mov_b32_e32 v190, s74 +v_cmp_lt_i32_e32 vcc, v180, v190 +v_sub_co_ci_u32_e64 v189, vcc, 0, 0, vcc +v_add_co_u32_e64 v181, vcc, v181, v189 +v_mad_i32_i24 v180, v189, v190, v180 +v_subrev_co_u32_e64 v180, vcc, s77, v180 +v_cmp_lt_i32_e32 vcc, v180, v190 +v_sub_co_ci_u32_e64 v189, vcc, 0, 0, vcc +v_add_co_u32_e64 v181, vcc, v181, v189 +v_mad_i32_i24 v180, v189, s74, v180 +v_subrev_co_u32_e64 v181, vcc, s78, v181 +s_mov_b32 s62, 0 +s_mov_b32 s63, s28 +s_mov_b32 s64, 1 +s_mov_b32 s84, 0 +s_mov_b32 s85, s16 +s_mov_b32 s83, s85 +s_sub_u32 s93, -1, s92 +s_sub_u32 s93, s93, 32 +s_bitset1_b32 s18, 21 +s_mov_b32 s47, 0 +s_mov_b32 s51, 0 +s_mov_b32 s94, 19 +s_mov_b32 s82, 0 +s_bitset1_b32 s18, 26 +s_call_b64 s[38:39], 2771 +v_cmp_le_u32_e32 vcc, 0x100, v1 +s_cbranch_vccnz 65 +s_branch 1508 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +v_subrev_f32_e64 v113, v115, v113 div:2 +v_fmac_f32_e32 v2, v69, v85 +v_fmac_f32_e32 v3, v70, v85 +v_fmac_f32_e32 v4, v71, v85 +v_fmac_f32_e32 v5, v72, v85 +v_fmac_f32_e32 v6, v73, v85 +v_fmac_f32_e32 v7, v74, v85 +v_fmac_f32_e32 v8, v75, v85 +v_fmac_f32_e32 v9, v76, v85 +v_subrev_f32_e64 v116, v114, v116 div:2 +v_fmac_f32_e32 v10, v69, v86 +v_fmac_f32_e32 v11, v70, v86 +v_fmac_f32_e32 v12, v71, v86 +v_fmac_f32_e32 v13, v72, v86 +v_fmac_f32_e32 v14, v73, v86 +v_fmac_f32_e32 v15, v74, v86 +v_fmac_f32_e32 v16, v75, v86 +v_fmac_f32_e32 v17, v76, v86 +v_add_f32_e64 v114, v115, v114 div:2 +v_fmac_f32_e32 v18, v69, v87 +v_fmac_f32_e32 v19, v70, v87 +v_fmac_f32_e32 v20, v71, v87 +v_fmac_f32_e32 v21, v72, v87 +v_fmac_f32_e32 v22, v73, v87 +v_fmac_f32_e32 v23, v74, v87 +v_fmac_f32_e32 v24, v75, v87 +v_fmac_f32_e32 v25, v76, v87 +v_fma_f32 v115, v115, 1.0, -v114 +v_fmac_f32_e32 v26, v69, v88 +v_fmac_f32_e32 v27, v70, v88 +v_fmac_f32_e32 v28, v71, v88 +v_fmac_f32_e32 v29, v72, v88 +v_fmac_f32_e32 v30, v73, v88 +v_fmac_f32_e32 v31, v74, v88 +v_fmac_f32_e32 v32, v75, v88 +v_fmac_f32_e32 v33, v76, v88 +v_fmac_f32_dpp v113, v113, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v34, v69, v89 +v_fmac_f32_e32 v35, v70, v89 +v_fmac_f32_e32 v36, v71, v89 +v_fmac_f32_e32 v37, v72, v89 +v_fmac_f32_e32 v38, v73, v89 +v_fmac_f32_e32 v39, v74, v89 +v_fmac_f32_e32 v40, v75, v89 +v_fmac_f32_e32 v41, v76, v89 +v_fmac_f32_dpp v114, v114, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v42, v69, v90 +v_fmac_f32_e32 v43, v70, v90 +v_fmac_f32_e32 v44, v71, v90 +v_fmac_f32_e32 v45, v72, v90 +v_fmac_f32_e32 v46, v73, v90 +v_fmac_f32_e32 v47, v74, v90 +v_fmac_f32_e32 v48, v75, v90 +v_fmac_f32_e32 v49, v76, v90 +v_fmac_f32_dpp v115, v115, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v50, v69, v91 +v_fmac_f32_e32 v51, v70, v91 +v_fmac_f32_e32 v52, v71, v91 +v_fmac_f32_e32 v53, v72, v91 +v_fmac_f32_e32 v54, v73, v91 +v_fmac_f32_e32 v55, v74, v91 +v_fmac_f32_e32 v56, v75, v91 +v_fmac_f32_e32 v57, v76, v91 +v_fmac_f32_dpp v116, v116, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v58, v69, v92 +v_fmac_f32_e32 v59, v70, v92 +v_fmac_f32_e32 v60, v71, v92 +v_fmac_f32_e32 v61, v72, v92 +v_fmac_f32_e32 v62, v73, v92 +v_fmac_f32_e32 v63, v74, v92 +v_fmac_f32_e32 v64, v75, v92 +v_fmac_f32_e32 v65, v76, v92 +ds_write_b32 v171, v105 +ds_read_b128 v[69:72], v175 offset:29440 +ds_write_b32 v172, v106 +ds_read_b128 v[73:76], v175 offset:29696 +ds_write_b32 v173, v107 +ds_read_b128 v[85:88], v165 offset:28928 +ds_write_b32 v174, v108 +ds_read_b128 v[89:92], v165 offset:29056 +s_setprio 0 +s_add_u32 s91, s91, 0x100 +s_add_u32 s40, s40, s69 +s_addc_u32 s41, s41, 0 +buffer_load_dword v101, v149, s[40:43], 0 offen +buffer_load_dword v103, v151, s[40:43], 0 offen +buffer_load_dword v102, v150, s[40:43], 0 offen +buffer_load_dword v104, v152, s[40:43], 0 offen +s_waitcnt vmcnt(28) lgkmcnt(8) +s_setprio 1 +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 6 +s_call_b64 s[38:39], 2581 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +v_subrev_f32_e64 v117, v119, v117 div:2 +v_fmac_f32_e32 v2, v77, v93 +v_fmac_f32_e32 v3, v78, v93 +v_fmac_f32_e32 v4, v79, v93 +v_fmac_f32_e32 v5, v80, v93 +v_fmac_f32_e32 v6, v81, v93 +v_fmac_f32_e32 v7, v82, v93 +v_fmac_f32_e32 v8, v83, v93 +v_fmac_f32_e32 v9, v84, v93 +v_subrev_f32_e64 v120, v118, v120 div:2 +v_fmac_f32_e32 v10, v77, v94 +v_fmac_f32_e32 v11, v78, v94 +v_fmac_f32_e32 v12, v79, v94 +v_fmac_f32_e32 v13, v80, v94 +v_fmac_f32_e32 v14, v81, v94 +v_fmac_f32_e32 v15, v82, v94 +v_fmac_f32_e32 v16, v83, v94 +v_fmac_f32_e32 v17, v84, v94 +v_add_f32_e64 v118, v119, v118 div:2 +v_fmac_f32_e32 v18, v77, v95 +v_fmac_f32_e32 v19, v78, v95 +v_fmac_f32_e32 v20, v79, v95 +v_fmac_f32_e32 v21, v80, v95 +v_fmac_f32_e32 v22, v81, v95 +v_fmac_f32_e32 v23, v82, v95 +v_fmac_f32_e32 v24, v83, v95 +v_fmac_f32_e32 v25, v84, v95 +v_fma_f32 v119, v119, 1.0, -v118 +v_fmac_f32_e32 v26, v77, v96 +v_fmac_f32_e32 v27, v78, v96 +v_fmac_f32_e32 v28, v79, v96 +v_fmac_f32_e32 v29, v80, v96 +v_fmac_f32_e32 v30, v81, v96 +v_fmac_f32_e32 v31, v82, v96 +v_fmac_f32_e32 v32, v83, v96 +v_fmac_f32_e32 v33, v84, v96 +v_fmac_f32_dpp v117, v117, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v34, v77, v97 +v_fmac_f32_e32 v35, v78, v97 +v_fmac_f32_e32 v36, v79, v97 +v_fmac_f32_e32 v37, v80, v97 +v_fmac_f32_e32 v38, v81, v97 +v_fmac_f32_e32 v39, v82, v97 +v_fmac_f32_e32 v40, v83, v97 +v_fmac_f32_e32 v41, v84, v97 +v_fmac_f32_dpp v118, v118, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v42, v77, v98 +v_fmac_f32_e32 v43, v78, v98 +v_fmac_f32_e32 v44, v79, v98 +v_fmac_f32_e32 v45, v80, v98 +v_fmac_f32_e32 v46, v81, v98 +v_fmac_f32_e32 v47, v82, v98 +v_fmac_f32_e32 v48, v83, v98 +v_fmac_f32_e32 v49, v84, v98 +v_fmac_f32_dpp v119, v119, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v50, v77, v99 +v_fmac_f32_e32 v51, v78, v99 +v_fmac_f32_e32 v52, v79, v99 +v_fmac_f32_e32 v53, v80, v99 +v_fmac_f32_e32 v54, v81, v99 +v_fmac_f32_e32 v55, v82, v99 +v_fmac_f32_e32 v56, v83, v99 +v_fmac_f32_e32 v57, v84, v99 +v_fmac_f32_dpp v120, v120, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v58, v77, v100 +v_fmac_f32_e32 v59, v78, v100 +v_fmac_f32_e32 v60, v79, v100 +v_fmac_f32_e32 v61, v80, v100 +v_fmac_f32_e32 v62, v81, v100 +v_fmac_f32_e32 v63, v82, v100 +v_fmac_f32_e32 v64, v83, v100 +v_fmac_f32_e32 v65, v84, v100 +ds_write_b32 v167, v109 offset:8256 +ds_read_b128 v[77:80], v175 offset:33536 +ds_write_b32 v168, v110 offset:8256 +ds_read_b128 v[81:84], v175 offset:33792 +ds_write_b32 v169, v111 offset:8256 +ds_read_b128 v[93:96], v165 offset:33024 +ds_write_b32 v170, v112 offset:8256 +ds_read_b128 v[97:100], v165 offset:33152 +s_setprio 0 +s_add_u32 s40, s40, s70 +s_addc_u32 s41, s41, s71 +buffer_load_dword v105, v153, s[40:43], 0 offen +buffer_load_dword v107, v155, s[40:43], 0 offen +buffer_load_dword v106, v154, s[40:43], 0 offen +buffer_load_dword v108, v156, s[40:43], 0 offen +s_waitcnt lgkmcnt(8) +ds_append v188 offset:65472 +s_setprio 1 +s_bitset1_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 6 +s_call_b64 s[38:39], 2461 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +v_subrev_f32_e64 v121, v123, v121 div:2 +v_fmac_f32_e32 v2, v69, v85 +v_fmac_f32_e32 v3, v70, v85 +v_fmac_f32_e32 v4, v71, v85 +v_fmac_f32_e32 v5, v72, v85 +v_fmac_f32_e32 v6, v73, v85 +v_fmac_f32_e32 v7, v74, v85 +v_fmac_f32_e32 v8, v75, v85 +v_fmac_f32_e32 v9, v76, v85 +v_subrev_f32_e64 v124, v122, v124 div:2 +v_fmac_f32_e32 v10, v69, v86 +v_fmac_f32_e32 v11, v70, v86 +v_fmac_f32_e32 v12, v71, v86 +v_fmac_f32_e32 v13, v72, v86 +v_fmac_f32_e32 v14, v73, v86 +v_fmac_f32_e32 v15, v74, v86 +v_fmac_f32_e32 v16, v75, v86 +v_fmac_f32_e32 v17, v76, v86 +v_add_f32_e64 v122, v123, v122 div:2 +v_fmac_f32_e32 v18, v69, v87 +v_fmac_f32_e32 v19, v70, v87 +v_fmac_f32_e32 v20, v71, v87 +v_fmac_f32_e32 v21, v72, v87 +v_fmac_f32_e32 v22, v73, v87 +v_fmac_f32_e32 v23, v74, v87 +v_fmac_f32_e32 v24, v75, v87 +v_fmac_f32_e32 v25, v76, v87 +v_fma_f32 v123, v123, 1.0, -v122 +v_fmac_f32_e32 v26, v69, v88 +v_fmac_f32_e32 v27, v70, v88 +v_fmac_f32_e32 v28, v71, v88 +v_fmac_f32_e32 v29, v72, v88 +v_fmac_f32_e32 v30, v73, v88 +v_fmac_f32_e32 v31, v74, v88 +v_fmac_f32_e32 v32, v75, v88 +v_fmac_f32_e32 v33, v76, v88 +v_fmac_f32_dpp v121, v121, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v34, v69, v89 +v_fmac_f32_e32 v35, v70, v89 +v_fmac_f32_e32 v36, v71, v89 +v_fmac_f32_e32 v37, v72, v89 +v_fmac_f32_e32 v38, v73, v89 +v_fmac_f32_e32 v39, v74, v89 +v_fmac_f32_e32 v40, v75, v89 +v_fmac_f32_e32 v41, v76, v89 +v_fmac_f32_dpp v122, v122, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v42, v69, v90 +v_fmac_f32_e32 v43, v70, v90 +v_fmac_f32_e32 v44, v71, v90 +v_fmac_f32_e32 v45, v72, v90 +v_fmac_f32_e32 v46, v73, v90 +v_fmac_f32_e32 v47, v74, v90 +v_fmac_f32_e32 v48, v75, v90 +v_fmac_f32_e32 v49, v76, v90 +v_fmac_f32_dpp v123, v123, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v50, v69, v91 +v_fmac_f32_e32 v51, v70, v91 +v_fmac_f32_e32 v52, v71, v91 +v_fmac_f32_e32 v53, v72, v91 +v_fmac_f32_e32 v54, v73, v91 +v_fmac_f32_e32 v55, v74, v91 +v_fmac_f32_e32 v56, v75, v91 +v_fmac_f32_e32 v57, v76, v91 +v_fmac_f32_dpp v124, v124, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v58, v69, v92 +v_fmac_f32_e32 v59, v70, v92 +v_fmac_f32_e32 v60, v71, v92 +v_fmac_f32_e32 v61, v72, v92 +v_fmac_f32_e32 v62, v73, v92 +v_fmac_f32_e32 v63, v74, v92 +v_fmac_f32_e32 v64, v75, v92 +v_fmac_f32_e32 v65, v76, v92 +ds_write_b32 v171, v113 offset:8256 +ds_read_b128 v[69:72], v175 offset:37696 +ds_write_b32 v172, v114 offset:8256 +ds_read_b128 v[73:76], v175 offset:37952 +ds_write_b32 v173, v115 offset:8256 +ds_read_b128 v[85:88], v165 offset:37184 +ds_write_b32 v174, v116 offset:8256 +ds_read_b128 v[89:92], v165 offset:37312 +s_setprio 0 +s_mov_b32 m0, 0x2ffc0 +s_add_u32 s40, s40, s69 +s_addc_u32 s41, s41, 0 +buffer_load_dword v109, v149, s[40:43], 0 offen +buffer_load_dword v111, v151, s[40:43], 0 offen +buffer_load_dword v110, v150, s[40:43], 0 offen +buffer_load_dword v112, v152, s[40:43], 0 offen +s_waitcnt vmcnt(28) lgkmcnt(8) +s_setprio 1 +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 6 +s_call_b64 s[38:39], 2341 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +v_subrev_f32_e64 v125, v127, v125 div:2 +v_fmac_f32_e32 v2, v77, v93 +v_fmac_f32_e32 v3, v78, v93 +v_fmac_f32_e32 v4, v79, v93 +v_fmac_f32_e32 v5, v80, v93 +v_fmac_f32_e32 v6, v81, v93 +v_fmac_f32_e32 v7, v82, v93 +v_fmac_f32_e32 v8, v83, v93 +v_fmac_f32_e32 v9, v84, v93 +v_subrev_f32_e64 v128, v126, v128 div:2 +v_fmac_f32_e32 v10, v77, v94 +v_fmac_f32_e32 v11, v78, v94 +v_fmac_f32_e32 v12, v79, v94 +v_fmac_f32_e32 v13, v80, v94 +v_fmac_f32_e32 v14, v81, v94 +v_fmac_f32_e32 v15, v82, v94 +v_fmac_f32_e32 v16, v83, v94 +v_fmac_f32_e32 v17, v84, v94 +v_add_f32_e64 v126, v127, v126 div:2 +v_fmac_f32_e32 v18, v77, v95 +v_fmac_f32_e32 v19, v78, v95 +v_fmac_f32_e32 v20, v79, v95 +v_fmac_f32_e32 v21, v80, v95 +v_fmac_f32_e32 v22, v81, v95 +v_fmac_f32_e32 v23, v82, v95 +v_fmac_f32_e32 v24, v83, v95 +v_fmac_f32_e32 v25, v84, v95 +v_fma_f32 v127, v127, 1.0, -v126 +v_fmac_f32_e32 v26, v77, v96 +v_fmac_f32_e32 v27, v78, v96 +v_fmac_f32_e32 v28, v79, v96 +v_fmac_f32_e32 v29, v80, v96 +v_fmac_f32_e32 v30, v81, v96 +v_fmac_f32_e32 v31, v82, v96 +v_fmac_f32_e32 v32, v83, v96 +v_fmac_f32_e32 v33, v84, v96 +v_fmac_f32_dpp v125, v125, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v34, v77, v97 +v_fmac_f32_e32 v35, v78, v97 +v_fmac_f32_e32 v36, v79, v97 +v_fmac_f32_e32 v37, v80, v97 +v_fmac_f32_e32 v38, v81, v97 +v_fmac_f32_e32 v39, v82, v97 +v_fmac_f32_e32 v40, v83, v97 +v_fmac_f32_e32 v41, v84, v97 +v_fmac_f32_dpp v126, v126, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v42, v77, v98 +v_fmac_f32_e32 v43, v78, v98 +v_fmac_f32_e32 v44, v79, v98 +v_fmac_f32_e32 v45, v80, v98 +v_fmac_f32_e32 v46, v81, v98 +v_fmac_f32_e32 v47, v82, v98 +v_fmac_f32_e32 v48, v83, v98 +v_fmac_f32_e32 v49, v84, v98 +v_fmac_f32_dpp v127, v127, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v50, v77, v99 +v_fmac_f32_e32 v51, v78, v99 +v_fmac_f32_e32 v52, v79, v99 +v_fmac_f32_e32 v53, v80, v99 +v_fmac_f32_e32 v54, v81, v99 +v_fmac_f32_e32 v55, v82, v99 +v_fmac_f32_e32 v56, v83, v99 +v_fmac_f32_e32 v57, v84, v99 +v_fmac_f32_dpp v128, v128, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v58, v77, v100 +v_fmac_f32_e32 v59, v78, v100 +v_fmac_f32_e32 v60, v79, v100 +v_fmac_f32_e32 v61, v80, v100 +v_fmac_f32_e32 v62, v81, v100 +v_fmac_f32_e32 v63, v82, v100 +v_fmac_f32_e32 v64, v83, v100 +v_fmac_f32_e32 v65, v84, v100 +v_cmp_eq_u32_e64 vcc, src_lds_direct, s91 +s_cbranch_vccz 65533 +ds_write_b32 v167, v117 offset:16512 +ds_read_b128 v[77:80], v175 offset:41792 +ds_write_b32 v168, v118 offset:16512 +ds_read_b128 v[81:84], v175 offset:42048 +ds_write_b32 v169, v119 offset:16512 +ds_read_b128 v[93:96], v165 offset:41280 +ds_write_b32 v170, v120 offset:16512 +ds_read_b128 v[97:100], v165 offset:41408 +s_setprio 0 +s_add_u32 s40, s40, s70 +s_addc_u32 s41, s41, s71 +buffer_load_dword v113, v153, s[40:43], 0 offen +buffer_load_dword v115, v155, s[40:43], 0 offen +buffer_load_dword v114, v154, s[40:43], 0 offen +buffer_load_dword v116, v156, s[40:43], 0 offen +s_waitcnt lgkmcnt(8) +s_setprio 1 +s_bitset1_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 5 +s_call_b64 s[38:39], 2220 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +v_subrev_f32_e64 v129, v131, v129 div:2 +v_fmac_f32_e32 v2, v69, v85 +v_fmac_f32_e32 v3, v70, v85 +v_fmac_f32_e32 v4, v71, v85 +v_fmac_f32_e32 v5, v72, v85 +v_fmac_f32_e32 v6, v73, v85 +v_fmac_f32_e32 v7, v74, v85 +v_fmac_f32_e32 v8, v75, v85 +v_fmac_f32_e32 v9, v76, v85 +v_subrev_f32_e64 v132, v130, v132 div:2 +v_fmac_f32_e32 v10, v69, v86 +v_fmac_f32_e32 v11, v70, v86 +v_fmac_f32_e32 v12, v71, v86 +v_fmac_f32_e32 v13, v72, v86 +v_fmac_f32_e32 v14, v73, v86 +v_fmac_f32_e32 v15, v74, v86 +v_fmac_f32_e32 v16, v75, v86 +v_fmac_f32_e32 v17, v76, v86 +v_add_f32_e64 v130, v131, v130 div:2 +v_fmac_f32_e32 v18, v69, v87 +v_fmac_f32_e32 v19, v70, v87 +v_fmac_f32_e32 v20, v71, v87 +v_fmac_f32_e32 v21, v72, v87 +v_fmac_f32_e32 v22, v73, v87 +v_fmac_f32_e32 v23, v74, v87 +v_fmac_f32_e32 v24, v75, v87 +v_fmac_f32_e32 v25, v76, v87 +v_fma_f32 v131, v131, 1.0, -v130 +v_fmac_f32_e32 v26, v69, v88 +v_fmac_f32_e32 v27, v70, v88 +v_fmac_f32_e32 v28, v71, v88 +v_fmac_f32_e32 v29, v72, v88 +v_fmac_f32_e32 v30, v73, v88 +v_fmac_f32_e32 v31, v74, v88 +v_fmac_f32_e32 v32, v75, v88 +v_fmac_f32_e32 v33, v76, v88 +v_fmac_f32_dpp v129, v129, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v34, v69, v89 +v_fmac_f32_e32 v35, v70, v89 +v_fmac_f32_e32 v36, v71, v89 +v_fmac_f32_e32 v37, v72, v89 +v_fmac_f32_e32 v38, v73, v89 +v_fmac_f32_e32 v39, v74, v89 +v_fmac_f32_e32 v40, v75, v89 +v_fmac_f32_e32 v41, v76, v89 +v_fmac_f32_dpp v130, v130, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v42, v69, v90 +v_fmac_f32_e32 v43, v70, v90 +v_fmac_f32_e32 v44, v71, v90 +v_fmac_f32_e32 v45, v72, v90 +v_fmac_f32_e32 v46, v73, v90 +v_fmac_f32_e32 v47, v74, v90 +v_fmac_f32_e32 v48, v75, v90 +v_fmac_f32_e32 v49, v76, v90 +v_fmac_f32_dpp v131, v131, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v50, v69, v91 +v_fmac_f32_e32 v51, v70, v91 +v_fmac_f32_e32 v52, v71, v91 +v_fmac_f32_e32 v53, v72, v91 +v_fmac_f32_e32 v54, v73, v91 +v_fmac_f32_e32 v55, v74, v91 +v_fmac_f32_e32 v56, v75, v91 +v_fmac_f32_e32 v57, v76, v91 +v_fmac_f32_dpp v132, v132, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v58, v69, v92 +v_fmac_f32_e32 v59, v70, v92 +v_fmac_f32_e32 v60, v71, v92 +v_fmac_f32_e32 v61, v72, v92 +v_fmac_f32_e32 v62, v73, v92 +v_fmac_f32_e32 v63, v74, v92 +v_fmac_f32_e32 v64, v75, v92 +v_fmac_f32_e32 v65, v76, v92 +ds_write_b32 v171, v121 offset:16512 +ds_read_b128 v[69:72], v175 offset:45952 +ds_write_b32 v172, v122 offset:16512 +ds_read_b128 v[73:76], v175 offset:46208 +ds_write_b32 v173, v123 offset:16512 +ds_read_b128 v[85:88], v165 offset:45440 +ds_write_b32 v174, v124 offset:16512 +ds_read_b128 v[89:92], v165 offset:45568 +s_setprio 0 +s_add_u32 s40, s40, s69 +s_addc_u32 s41, s41, 0 +buffer_load_dword v117, v149, s[40:43], 0 offen +buffer_load_dword v119, v151, s[40:43], 0 offen +buffer_load_dword v118, v150, s[40:43], 0 offen +buffer_load_dword v120, v152, s[40:43], 0 offen +s_waitcnt vmcnt(28) lgkmcnt(8) +s_setprio 1 +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 8 +s_call_b64 s[38:39], 2103 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +v_subrev_f32_e64 v133, v135, v133 div:2 +v_fmac_f32_e32 v2, v77, v93 +v_fmac_f32_e32 v3, v78, v93 +v_fmac_f32_e32 v4, v79, v93 +v_fmac_f32_e32 v5, v80, v93 +v_fmac_f32_e32 v6, v81, v93 +v_fmac_f32_e32 v7, v82, v93 +v_fmac_f32_e32 v8, v83, v93 +v_fmac_f32_e32 v9, v84, v93 +v_subrev_f32_e64 v136, v134, v136 div:2 +v_fmac_f32_e32 v10, v77, v94 +v_fmac_f32_e32 v11, v78, v94 +v_fmac_f32_e32 v12, v79, v94 +v_fmac_f32_e32 v13, v80, v94 +v_fmac_f32_e32 v14, v81, v94 +v_fmac_f32_e32 v15, v82, v94 +v_fmac_f32_e32 v16, v83, v94 +v_fmac_f32_e32 v17, v84, v94 +v_add_f32_e64 v134, v135, v134 div:2 +v_fmac_f32_e32 v18, v77, v95 +v_fmac_f32_e32 v19, v78, v95 +v_fmac_f32_e32 v20, v79, v95 +v_fmac_f32_e32 v21, v80, v95 +v_fmac_f32_e32 v22, v81, v95 +v_fmac_f32_e32 v23, v82, v95 +v_fmac_f32_e32 v24, v83, v95 +v_fmac_f32_e32 v25, v84, v95 +v_fma_f32 v135, v135, 1.0, -v134 +v_fmac_f32_e32 v26, v77, v96 +v_fmac_f32_e32 v27, v78, v96 +v_fmac_f32_e32 v28, v79, v96 +v_fmac_f32_e32 v29, v80, v96 +v_fmac_f32_e32 v30, v81, v96 +v_fmac_f32_e32 v31, v82, v96 +v_fmac_f32_e32 v32, v83, v96 +v_fmac_f32_e32 v33, v84, v96 +v_fmac_f32_dpp v133, v133, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v34, v77, v97 +v_fmac_f32_e32 v35, v78, v97 +v_fmac_f32_e32 v36, v79, v97 +v_fmac_f32_e32 v37, v80, v97 +v_fmac_f32_e32 v38, v81, v97 +v_fmac_f32_e32 v39, v82, v97 +v_fmac_f32_e32 v40, v83, v97 +v_fmac_f32_e32 v41, v84, v97 +v_fmac_f32_dpp v134, v134, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v42, v77, v98 +v_fmac_f32_e32 v43, v78, v98 +v_fmac_f32_e32 v44, v79, v98 +v_fmac_f32_e32 v45, v80, v98 +v_fmac_f32_e32 v46, v81, v98 +v_fmac_f32_e32 v47, v82, v98 +v_fmac_f32_e32 v48, v83, v98 +v_fmac_f32_e32 v49, v84, v98 +v_fmac_f32_dpp v135, v135, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v50, v77, v99 +v_fmac_f32_e32 v51, v78, v99 +v_fmac_f32_e32 v52, v79, v99 +v_fmac_f32_e32 v53, v80, v99 +v_fmac_f32_e32 v54, v81, v99 +v_fmac_f32_e32 v55, v82, v99 +v_fmac_f32_e32 v56, v83, v99 +v_fmac_f32_e32 v57, v84, v99 +v_fmac_f32_dpp v136, v136, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v58, v77, v100 +v_fmac_f32_e32 v59, v78, v100 +v_fmac_f32_e32 v60, v79, v100 +v_fmac_f32_e32 v61, v80, v100 +v_fmac_f32_e32 v62, v81, v100 +v_fmac_f32_e32 v63, v82, v100 +v_fmac_f32_e32 v64, v83, v100 +v_fmac_f32_e32 v65, v84, v100 +ds_write_b32 v167, v125 offset:24768 +ds_read_b128 v[77:80], v175 offset:512 +ds_write_b32 v168, v126 offset:24768 +ds_read_b128 v[81:84], v175 offset:768 +ds_write_b32 v169, v127 offset:24768 +ds_read_b128 v[93:96], v165 +ds_write_b32 v170, v128 offset:24768 +ds_read_b128 v[97:100], v165 offset:128 +s_setprio 0 +s_add_u32 s40, s40, s70 +s_addc_u32 s41, s41, s71 +buffer_load_dword v121, v153, s[40:43], 0 offen +buffer_load_dword v123, v155, s[40:43], 0 offen +buffer_load_dword v122, v154, s[40:43], 0 offen +buffer_load_dword v124, v156, s[40:43], 0 offen +s_waitcnt lgkmcnt(8) +ds_append v188 offset:65476 +s_setprio 1 +s_bitset1_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 6 +s_call_b64 s[38:39], 1981 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +v_subrev_f32_e64 v137, v139, v137 div:2 +v_fmac_f32_e32 v2, v69, v85 +v_fmac_f32_e32 v3, v70, v85 +v_fmac_f32_e32 v4, v71, v85 +v_fmac_f32_e32 v5, v72, v85 +v_fmac_f32_e32 v6, v73, v85 +v_fmac_f32_e32 v7, v74, v85 +v_fmac_f32_e32 v8, v75, v85 +v_fmac_f32_e32 v9, v76, v85 +v_subrev_f32_e64 v140, v138, v140 div:2 +v_fmac_f32_e32 v10, v69, v86 +v_fmac_f32_e32 v11, v70, v86 +v_fmac_f32_e32 v12, v71, v86 +v_fmac_f32_e32 v13, v72, v86 +v_fmac_f32_e32 v14, v73, v86 +v_fmac_f32_e32 v15, v74, v86 +v_fmac_f32_e32 v16, v75, v86 +v_fmac_f32_e32 v17, v76, v86 +v_add_f32_e64 v138, v139, v138 div:2 +v_fmac_f32_e32 v18, v69, v87 +v_fmac_f32_e32 v19, v70, v87 +v_fmac_f32_e32 v20, v71, v87 +v_fmac_f32_e32 v21, v72, v87 +v_fmac_f32_e32 v22, v73, v87 +v_fmac_f32_e32 v23, v74, v87 +v_fmac_f32_e32 v24, v75, v87 +v_fmac_f32_e32 v25, v76, v87 +v_fma_f32 v139, v139, 1.0, -v138 +v_fmac_f32_e32 v26, v69, v88 +v_fmac_f32_e32 v27, v70, v88 +v_fmac_f32_e32 v28, v71, v88 +v_fmac_f32_e32 v29, v72, v88 +v_fmac_f32_e32 v30, v73, v88 +v_fmac_f32_e32 v31, v74, v88 +v_fmac_f32_e32 v32, v75, v88 +v_fmac_f32_e32 v33, v76, v88 +v_fmac_f32_dpp v137, v137, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v34, v69, v89 +v_fmac_f32_e32 v35, v70, v89 +v_fmac_f32_e32 v36, v71, v89 +v_fmac_f32_e32 v37, v72, v89 +v_fmac_f32_e32 v38, v73, v89 +v_fmac_f32_e32 v39, v74, v89 +v_fmac_f32_e32 v40, v75, v89 +v_fmac_f32_e32 v41, v76, v89 +v_fmac_f32_dpp v138, v138, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v42, v69, v90 +v_fmac_f32_e32 v43, v70, v90 +v_fmac_f32_e32 v44, v71, v90 +v_fmac_f32_e32 v45, v72, v90 +v_fmac_f32_e32 v46, v73, v90 +v_fmac_f32_e32 v47, v74, v90 +v_fmac_f32_e32 v48, v75, v90 +v_fmac_f32_e32 v49, v76, v90 +v_fmac_f32_dpp v139, v139, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v50, v69, v91 +v_fmac_f32_e32 v51, v70, v91 +v_fmac_f32_e32 v52, v71, v91 +v_fmac_f32_e32 v53, v72, v91 +v_fmac_f32_e32 v54, v73, v91 +v_fmac_f32_e32 v55, v74, v91 +v_fmac_f32_e32 v56, v75, v91 +v_fmac_f32_e32 v57, v76, v91 +v_fmac_f32_dpp v140, v140, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v58, v69, v92 +v_fmac_f32_e32 v59, v70, v92 +v_fmac_f32_e32 v60, v71, v92 +v_fmac_f32_e32 v61, v72, v92 +v_fmac_f32_e32 v62, v73, v92 +v_fmac_f32_e32 v63, v74, v92 +v_fmac_f32_e32 v64, v75, v92 +v_fmac_f32_e32 v65, v76, v92 +ds_write_b32 v171, v129 offset:24768 +ds_read_b128 v[69:72], v175 offset:4672 +ds_write_b32 v172, v130 offset:24768 +ds_read_b128 v[73:76], v175 offset:4928 +ds_write_b32 v173, v131 offset:24768 +ds_read_b128 v[85:88], v165 offset:4160 +ds_write_b32 v174, v132 offset:24768 +ds_read_b128 v[89:92], v165 offset:4288 +s_setprio 0 +s_mov_b32 m0, 0x2ffc4 +s_add_u32 s40, s40, s69 +s_addc_u32 s41, s41, 0 +buffer_load_dword v125, v149, s[40:43], 0 offen +buffer_load_dword v127, v151, s[40:43], 0 offen +buffer_load_dword v126, v150, s[40:43], 0 offen +buffer_load_dword v128, v152, s[40:43], 0 offen +s_waitcnt vmcnt(28) lgkmcnt(8) +s_setprio 1 +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 6 +s_call_b64 s[38:39], 1861 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +v_subrev_f32_e64 v141, v143, v141 div:2 +v_fmac_f32_e32 v2, v77, v93 +v_fmac_f32_e32 v3, v78, v93 +v_fmac_f32_e32 v4, v79, v93 +v_fmac_f32_e32 v5, v80, v93 +v_fmac_f32_e32 v6, v81, v93 +v_fmac_f32_e32 v7, v82, v93 +v_fmac_f32_e32 v8, v83, v93 +v_fmac_f32_e32 v9, v84, v93 +v_subrev_f32_e64 v144, v142, v144 div:2 +v_fmac_f32_e32 v10, v77, v94 +v_fmac_f32_e32 v11, v78, v94 +v_fmac_f32_e32 v12, v79, v94 +v_fmac_f32_e32 v13, v80, v94 +v_fmac_f32_e32 v14, v81, v94 +v_fmac_f32_e32 v15, v82, v94 +v_fmac_f32_e32 v16, v83, v94 +v_fmac_f32_e32 v17, v84, v94 +v_add_f32_e64 v142, v143, v142 div:2 +v_fmac_f32_e32 v18, v77, v95 +v_fmac_f32_e32 v19, v78, v95 +v_fmac_f32_e32 v20, v79, v95 +v_fmac_f32_e32 v21, v80, v95 +v_fmac_f32_e32 v22, v81, v95 +v_fmac_f32_e32 v23, v82, v95 +v_fmac_f32_e32 v24, v83, v95 +v_fmac_f32_e32 v25, v84, v95 +v_fma_f32 v143, v143, 1.0, -v142 +v_fmac_f32_e32 v26, v77, v96 +v_fmac_f32_e32 v27, v78, v96 +v_fmac_f32_e32 v28, v79, v96 +v_fmac_f32_e32 v29, v80, v96 +v_fmac_f32_e32 v30, v81, v96 +v_fmac_f32_e32 v31, v82, v96 +v_fmac_f32_e32 v32, v83, v96 +v_fmac_f32_e32 v33, v84, v96 +v_fmac_f32_dpp v141, v141, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v34, v77, v97 +v_fmac_f32_e32 v35, v78, v97 +v_fmac_f32_e32 v36, v79, v97 +v_fmac_f32_e32 v37, v80, v97 +v_fmac_f32_e32 v38, v81, v97 +v_fmac_f32_e32 v39, v82, v97 +v_fmac_f32_e32 v40, v83, v97 +v_fmac_f32_e32 v41, v84, v97 +v_fmac_f32_dpp v142, v142, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v42, v77, v98 +v_fmac_f32_e32 v43, v78, v98 +v_fmac_f32_e32 v44, v79, v98 +v_fmac_f32_e32 v45, v80, v98 +v_fmac_f32_e32 v46, v81, v98 +v_fmac_f32_e32 v47, v82, v98 +v_fmac_f32_e32 v48, v83, v98 +v_fmac_f32_e32 v49, v84, v98 +v_fmac_f32_dpp v143, v143, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v50, v77, v99 +v_fmac_f32_e32 v51, v78, v99 +v_fmac_f32_e32 v52, v79, v99 +v_fmac_f32_e32 v53, v80, v99 +v_fmac_f32_e32 v54, v81, v99 +v_fmac_f32_e32 v55, v82, v99 +v_fmac_f32_e32 v56, v83, v99 +v_fmac_f32_e32 v57, v84, v99 +v_fmac_f32_dpp v144, v144, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v58, v77, v100 +v_fmac_f32_e32 v59, v78, v100 +v_fmac_f32_e32 v60, v79, v100 +v_fmac_f32_e32 v61, v80, v100 +v_fmac_f32_e32 v62, v81, v100 +v_fmac_f32_e32 v63, v82, v100 +v_fmac_f32_e32 v64, v83, v100 +v_fmac_f32_e32 v65, v84, v100 +v_cmp_eq_u32_e64 vcc, src_lds_direct, s91 +s_cbranch_vccz 65533 +ds_write_b32 v167, v133 offset:33024 +ds_read_b128 v[77:80], v175 offset:8768 +ds_write_b32 v168, v134 offset:33024 +ds_read_b128 v[81:84], v175 offset:9024 +ds_write_b32 v169, v135 offset:33024 +ds_read_b128 v[93:96], v165 offset:8256 +ds_write_b32 v170, v136 offset:33024 +ds_read_b128 v[97:100], v165 offset:8384 +s_setprio 0 +s_add_u32 s40, s40, s70 +s_addc_u32 s41, s41, s71 +buffer_load_dword v129, v153, s[40:43], 0 offen +buffer_load_dword v131, v155, s[40:43], 0 offen +buffer_load_dword v130, v154, s[40:43], 0 offen +buffer_load_dword v132, v156, s[40:43], 0 offen +s_waitcnt lgkmcnt(8) +s_setprio 1 +s_bitset1_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 5 +s_call_b64 s[38:39], 1740 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +v_subrev_f32_e64 v145, v147, v145 div:2 +v_fmac_f32_e32 v2, v69, v85 +v_fmac_f32_e32 v3, v70, v85 +v_fmac_f32_e32 v4, v71, v85 +v_fmac_f32_e32 v5, v72, v85 +v_fmac_f32_e32 v6, v73, v85 +v_fmac_f32_e32 v7, v74, v85 +v_fmac_f32_e32 v8, v75, v85 +v_fmac_f32_e32 v9, v76, v85 +v_subrev_f32_e64 v148, v146, v148 div:2 +v_fmac_f32_e32 v10, v69, v86 +v_fmac_f32_e32 v11, v70, v86 +v_fmac_f32_e32 v12, v71, v86 +v_fmac_f32_e32 v13, v72, v86 +v_fmac_f32_e32 v14, v73, v86 +v_fmac_f32_e32 v15, v74, v86 +v_fmac_f32_e32 v16, v75, v86 +v_fmac_f32_e32 v17, v76, v86 +v_add_f32_e64 v146, v147, v146 div:2 +v_fmac_f32_e32 v18, v69, v87 +v_fmac_f32_e32 v19, v70, v87 +v_fmac_f32_e32 v20, v71, v87 +v_fmac_f32_e32 v21, v72, v87 +v_fmac_f32_e32 v22, v73, v87 +v_fmac_f32_e32 v23, v74, v87 +v_fmac_f32_e32 v24, v75, v87 +v_fmac_f32_e32 v25, v76, v87 +v_fma_f32 v147, v147, 1.0, -v146 +v_fmac_f32_e32 v26, v69, v88 +v_fmac_f32_e32 v27, v70, v88 +v_fmac_f32_e32 v28, v71, v88 +v_fmac_f32_e32 v29, v72, v88 +v_fmac_f32_e32 v30, v73, v88 +v_fmac_f32_e32 v31, v74, v88 +v_fmac_f32_e32 v32, v75, v88 +v_fmac_f32_e32 v33, v76, v88 +v_fmac_f32_dpp v145, v145, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v34, v69, v89 +v_fmac_f32_e32 v35, v70, v89 +v_fmac_f32_e32 v36, v71, v89 +v_fmac_f32_e32 v37, v72, v89 +v_fmac_f32_e32 v38, v73, v89 +v_fmac_f32_e32 v39, v74, v89 +v_fmac_f32_e32 v40, v75, v89 +v_fmac_f32_e32 v41, v76, v89 +v_fmac_f32_dpp v146, v146, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v42, v69, v90 +v_fmac_f32_e32 v43, v70, v90 +v_fmac_f32_e32 v44, v71, v90 +v_fmac_f32_e32 v45, v72, v90 +v_fmac_f32_e32 v46, v73, v90 +v_fmac_f32_e32 v47, v74, v90 +v_fmac_f32_e32 v48, v75, v90 +v_fmac_f32_e32 v49, v76, v90 +v_fmac_f32_dpp v147, v147, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v50, v69, v91 +v_fmac_f32_e32 v51, v70, v91 +v_fmac_f32_e32 v52, v71, v91 +v_fmac_f32_e32 v53, v72, v91 +v_fmac_f32_e32 v54, v73, v91 +v_fmac_f32_e32 v55, v74, v91 +v_fmac_f32_e32 v56, v75, v91 +v_fmac_f32_e32 v57, v76, v91 +v_fmac_f32_dpp v148, v148, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v58, v69, v92 +v_fmac_f32_e32 v59, v70, v92 +v_fmac_f32_e32 v60, v71, v92 +v_fmac_f32_e32 v61, v72, v92 +v_fmac_f32_e32 v62, v73, v92 +v_fmac_f32_e32 v63, v74, v92 +v_fmac_f32_e32 v64, v75, v92 +v_fmac_f32_e32 v65, v76, v92 +ds_write_b32 v171, v137 offset:33024 +ds_read_b128 v[69:72], v175 offset:12928 +ds_write_b32 v172, v138 offset:33024 +ds_read_b128 v[73:76], v175 offset:13184 +ds_write_b32 v173, v139 offset:33024 +ds_read_b128 v[85:88], v165 offset:12416 +ds_write_b32 v174, v140 offset:33024 +ds_read_b128 v[89:92], v165 offset:12544 +s_setprio 0 +s_add_u32 s40, s40, s69 +s_addc_u32 s41, s41, 0 +buffer_load_dword v133, v149, s[40:43], 0 offen +buffer_load_dword v135, v151, s[40:43], 0 offen +buffer_load_dword v134, v150, s[40:43], 0 offen +buffer_load_dword v136, v152, s[40:43], 0 offen +s_waitcnt vmcnt(28) lgkmcnt(8) +s_setprio 1 +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 8 +s_call_b64 s[38:39], 1623 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +v_subrev_f32_e64 v101, v103, v101 div:2 +v_fmac_f32_e32 v2, v77, v93 +v_fmac_f32_e32 v3, v78, v93 +v_fmac_f32_e32 v4, v79, v93 +v_fmac_f32_e32 v5, v80, v93 +v_fmac_f32_e32 v6, v81, v93 +v_fmac_f32_e32 v7, v82, v93 +v_fmac_f32_e32 v8, v83, v93 +v_fmac_f32_e32 v9, v84, v93 +v_subrev_f32_e64 v104, v102, v104 div:2 +v_fmac_f32_e32 v10, v77, v94 +v_fmac_f32_e32 v11, v78, v94 +v_fmac_f32_e32 v12, v79, v94 +v_fmac_f32_e32 v13, v80, v94 +v_fmac_f32_e32 v14, v81, v94 +v_fmac_f32_e32 v15, v82, v94 +v_fmac_f32_e32 v16, v83, v94 +v_fmac_f32_e32 v17, v84, v94 +v_add_f32_e64 v102, v103, v102 div:2 +v_fmac_f32_e32 v18, v77, v95 +v_fmac_f32_e32 v19, v78, v95 +v_fmac_f32_e32 v20, v79, v95 +v_fmac_f32_e32 v21, v80, v95 +v_fmac_f32_e32 v22, v81, v95 +v_fmac_f32_e32 v23, v82, v95 +v_fmac_f32_e32 v24, v83, v95 +v_fmac_f32_e32 v25, v84, v95 +v_fma_f32 v103, v103, 1.0, -v102 +v_fmac_f32_e32 v26, v77, v96 +v_fmac_f32_e32 v27, v78, v96 +v_fmac_f32_e32 v28, v79, v96 +v_fmac_f32_e32 v29, v80, v96 +v_fmac_f32_e32 v30, v81, v96 +v_fmac_f32_e32 v31, v82, v96 +v_fmac_f32_e32 v32, v83, v96 +v_fmac_f32_e32 v33, v84, v96 +v_fmac_f32_dpp v101, v101, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v34, v77, v97 +v_fmac_f32_e32 v35, v78, v97 +v_fmac_f32_e32 v36, v79, v97 +v_fmac_f32_e32 v37, v80, v97 +v_fmac_f32_e32 v38, v81, v97 +v_fmac_f32_e32 v39, v82, v97 +v_fmac_f32_e32 v40, v83, v97 +v_fmac_f32_e32 v41, v84, v97 +v_fmac_f32_dpp v102, v102, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v42, v77, v98 +v_fmac_f32_e32 v43, v78, v98 +v_fmac_f32_e32 v44, v79, v98 +v_fmac_f32_e32 v45, v80, v98 +v_fmac_f32_e32 v46, v81, v98 +v_fmac_f32_e32 v47, v82, v98 +v_fmac_f32_e32 v48, v83, v98 +v_fmac_f32_e32 v49, v84, v98 +v_fmac_f32_dpp v103, v103, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v50, v77, v99 +v_fmac_f32_e32 v51, v78, v99 +v_fmac_f32_e32 v52, v79, v99 +v_fmac_f32_e32 v53, v80, v99 +v_fmac_f32_e32 v54, v81, v99 +v_fmac_f32_e32 v55, v82, v99 +v_fmac_f32_e32 v56, v83, v99 +v_fmac_f32_e32 v57, v84, v99 +v_fmac_f32_dpp v104, v104, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v58, v77, v100 +v_fmac_f32_e32 v59, v78, v100 +v_fmac_f32_e32 v60, v79, v100 +v_fmac_f32_e32 v61, v80, v100 +v_fmac_f32_e32 v62, v81, v100 +v_fmac_f32_e32 v63, v82, v100 +v_fmac_f32_e32 v64, v83, v100 +v_fmac_f32_e32 v65, v84, v100 +ds_write_b32 v167, v141 offset:41280 +ds_read_b128 v[77:80], v175 offset:17024 +ds_write_b32 v168, v142 offset:41280 +ds_read_b128 v[81:84], v175 offset:17280 +ds_write_b32 v169, v143 offset:41280 +ds_read_b128 v[93:96], v165 offset:16512 +ds_write_b32 v170, v144 offset:41280 +ds_read_b128 v[97:100], v165 offset:16640 +s_setprio 0 +s_add_u32 s40, s40, s70 +s_addc_u32 s41, s41, s71 +buffer_load_dword v137, v153, s[40:43], 0 offen +buffer_load_dword v139, v155, s[40:43], 0 offen +buffer_load_dword v138, v154, s[40:43], 0 offen +buffer_load_dword v140, v156, s[40:43], 0 offen +s_waitcnt lgkmcnt(8) +ds_append v188 offset:65480 +s_setprio 1 +s_bitset1_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 6 +s_call_b64 s[38:39], 1501 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +v_subrev_f32_e64 v105, v107, v105 div:2 +v_fmac_f32_e32 v2, v69, v85 +v_fmac_f32_e32 v3, v70, v85 +v_fmac_f32_e32 v4, v71, v85 +v_fmac_f32_e32 v5, v72, v85 +v_fmac_f32_e32 v6, v73, v85 +v_fmac_f32_e32 v7, v74, v85 +v_fmac_f32_e32 v8, v75, v85 +v_fmac_f32_e32 v9, v76, v85 +v_subrev_f32_e64 v108, v106, v108 div:2 +v_fmac_f32_e32 v10, v69, v86 +v_fmac_f32_e32 v11, v70, v86 +v_fmac_f32_e32 v12, v71, v86 +v_fmac_f32_e32 v13, v72, v86 +v_fmac_f32_e32 v14, v73, v86 +v_fmac_f32_e32 v15, v74, v86 +v_fmac_f32_e32 v16, v75, v86 +v_fmac_f32_e32 v17, v76, v86 +v_add_f32_e64 v106, v107, v106 div:2 +v_fmac_f32_e32 v18, v69, v87 +v_fmac_f32_e32 v19, v70, v87 +v_fmac_f32_e32 v20, v71, v87 +v_fmac_f32_e32 v21, v72, v87 +v_fmac_f32_e32 v22, v73, v87 +v_fmac_f32_e32 v23, v74, v87 +v_fmac_f32_e32 v24, v75, v87 +v_fmac_f32_e32 v25, v76, v87 +v_fma_f32 v107, v107, 1.0, -v106 +v_fmac_f32_e32 v26, v69, v88 +v_fmac_f32_e32 v27, v70, v88 +v_fmac_f32_e32 v28, v71, v88 +v_fmac_f32_e32 v29, v72, v88 +v_fmac_f32_e32 v30, v73, v88 +v_fmac_f32_e32 v31, v74, v88 +v_fmac_f32_e32 v32, v75, v88 +v_fmac_f32_e32 v33, v76, v88 +v_fmac_f32_dpp v105, v105, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v34, v69, v89 +v_fmac_f32_e32 v35, v70, v89 +v_fmac_f32_e32 v36, v71, v89 +v_fmac_f32_e32 v37, v72, v89 +v_fmac_f32_e32 v38, v73, v89 +v_fmac_f32_e32 v39, v74, v89 +v_fmac_f32_e32 v40, v75, v89 +v_fmac_f32_e32 v41, v76, v89 +v_fmac_f32_dpp v106, v106, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v42, v69, v90 +v_fmac_f32_e32 v43, v70, v90 +v_fmac_f32_e32 v44, v71, v90 +v_fmac_f32_e32 v45, v72, v90 +v_fmac_f32_e32 v46, v73, v90 +v_fmac_f32_e32 v47, v74, v90 +v_fmac_f32_e32 v48, v75, v90 +v_fmac_f32_e32 v49, v76, v90 +v_fmac_f32_dpp v107, v107, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v50, v69, v91 +v_fmac_f32_e32 v51, v70, v91 +v_fmac_f32_e32 v52, v71, v91 +v_fmac_f32_e32 v53, v72, v91 +v_fmac_f32_e32 v54, v73, v91 +v_fmac_f32_e32 v55, v74, v91 +v_fmac_f32_e32 v56, v75, v91 +v_fmac_f32_e32 v57, v76, v91 +v_fmac_f32_dpp v108, v108, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v58, v69, v92 +v_fmac_f32_e32 v59, v70, v92 +v_fmac_f32_e32 v60, v71, v92 +v_fmac_f32_e32 v61, v72, v92 +v_fmac_f32_e32 v62, v73, v92 +v_fmac_f32_e32 v63, v74, v92 +v_fmac_f32_e32 v64, v75, v92 +v_fmac_f32_e32 v65, v76, v92 +ds_write_b32 v171, v145 offset:41280 +ds_read_b128 v[69:72], v175 offset:21184 +ds_write_b32 v172, v146 offset:41280 +ds_read_b128 v[73:76], v175 offset:21440 +ds_write_b32 v173, v147 offset:41280 +ds_read_b128 v[85:88], v165 offset:20672 +ds_write_b32 v174, v148 offset:41280 +ds_read_b128 v[89:92], v165 offset:20800 +s_setprio 0 +s_mov_b32 m0, 0x2ffc8 +s_add_u32 s40, s40, s69 +s_addc_u32 s41, s41, 0 +buffer_load_dword v141, v149, s[40:43], 0 offen +buffer_load_dword v143, v151, s[40:43], 0 offen +buffer_load_dword v142, v150, s[40:43], 0 offen +buffer_load_dword v144, v152, s[40:43], 0 offen +s_waitcnt vmcnt(28) lgkmcnt(8) +s_setprio 1 +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 6 +s_call_b64 s[38:39], 1381 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +v_subrev_f32_e64 v109, v111, v109 div:2 +v_fmac_f32_e32 v2, v77, v93 +v_fmac_f32_e32 v3, v78, v93 +v_fmac_f32_e32 v4, v79, v93 +v_fmac_f32_e32 v5, v80, v93 +v_fmac_f32_e32 v6, v81, v93 +v_fmac_f32_e32 v7, v82, v93 +v_fmac_f32_e32 v8, v83, v93 +v_fmac_f32_e32 v9, v84, v93 +v_subrev_f32_e64 v112, v110, v112 div:2 +v_fmac_f32_e32 v10, v77, v94 +v_fmac_f32_e32 v11, v78, v94 +v_fmac_f32_e32 v12, v79, v94 +v_fmac_f32_e32 v13, v80, v94 +v_fmac_f32_e32 v14, v81, v94 +v_fmac_f32_e32 v15, v82, v94 +v_fmac_f32_e32 v16, v83, v94 +v_fmac_f32_e32 v17, v84, v94 +v_add_f32_e64 v110, v111, v110 div:2 +v_fmac_f32_e32 v18, v77, v95 +v_fmac_f32_e32 v19, v78, v95 +v_fmac_f32_e32 v20, v79, v95 +v_fmac_f32_e32 v21, v80, v95 +v_fmac_f32_e32 v22, v81, v95 +v_fmac_f32_e32 v23, v82, v95 +v_fmac_f32_e32 v24, v83, v95 +v_fmac_f32_e32 v25, v84, v95 +v_fma_f32 v111, v111, 1.0, -v110 +v_fmac_f32_e32 v26, v77, v96 +v_fmac_f32_e32 v27, v78, v96 +v_fmac_f32_e32 v28, v79, v96 +v_fmac_f32_e32 v29, v80, v96 +v_fmac_f32_e32 v30, v81, v96 +v_fmac_f32_e32 v31, v82, v96 +v_fmac_f32_e32 v32, v83, v96 +v_fmac_f32_e32 v33, v84, v96 +v_fmac_f32_dpp v109, v109, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v34, v77, v97 +v_fmac_f32_e32 v35, v78, v97 +v_fmac_f32_e32 v36, v79, v97 +v_fmac_f32_e32 v37, v80, v97 +v_fmac_f32_e32 v38, v81, v97 +v_fmac_f32_e32 v39, v82, v97 +v_fmac_f32_e32 v40, v83, v97 +v_fmac_f32_e32 v41, v84, v97 +v_fmac_f32_dpp v110, v110, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v42, v77, v98 +v_fmac_f32_e32 v43, v78, v98 +v_fmac_f32_e32 v44, v79, v98 +v_fmac_f32_e32 v45, v80, v98 +v_fmac_f32_e32 v46, v81, v98 +v_fmac_f32_e32 v47, v82, v98 +v_fmac_f32_e32 v48, v83, v98 +v_fmac_f32_e32 v49, v84, v98 +v_fmac_f32_dpp v111, v111, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v50, v77, v99 +v_fmac_f32_e32 v51, v78, v99 +v_fmac_f32_e32 v52, v79, v99 +v_fmac_f32_e32 v53, v80, v99 +v_fmac_f32_e32 v54, v81, v99 +v_fmac_f32_e32 v55, v82, v99 +v_fmac_f32_e32 v56, v83, v99 +v_fmac_f32_e32 v57, v84, v99 +v_fmac_f32_dpp v112, v112, v183 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v58, v77, v100 +v_fmac_f32_e32 v59, v78, v100 +v_fmac_f32_e32 v60, v79, v100 +v_fmac_f32_e32 v61, v80, v100 +v_fmac_f32_e32 v62, v81, v100 +v_fmac_f32_e32 v63, v82, v100 +v_fmac_f32_e32 v64, v83, v100 +v_fmac_f32_e32 v65, v84, v100 +v_cmp_eq_u32_e64 vcc, src_lds_direct, s91 +s_cbranch_vccz 65533 +ds_write_b32 v167, v101 +ds_read_b128 v[77:80], v175 offset:25280 +ds_write_b32 v168, v102 +ds_read_b128 v[81:84], v175 offset:25536 +ds_write_b32 v169, v103 +ds_read_b128 v[93:96], v165 offset:24768 +ds_write_b32 v170, v104 +ds_read_b128 v[97:100], v165 offset:24896 +s_setprio 0 +s_add_u32 s40, s40, s70 +s_addc_u32 s41, s41, s71 +buffer_load_dword v145, v153, s[40:43], 0 offen +buffer_load_dword v147, v155, s[40:43], 0 offen +buffer_load_dword v146, v154, s[40:43], 0 offen +buffer_load_dword v148, v156, s[40:43], 0 offen +s_waitcnt lgkmcnt(8) +s_setprio 1 +s_bitset1_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 64101 +s_call_b64 s[38:39], 1260 +s_branch 64099 +s_nop 0 +s_nop 0 +s_nop 0 +v_fmac_f32_dpp v113, v113, v183 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v2, v69, v85 +v_fmac_f32_e32 v3, v70, v85 +v_fmac_f32_e32 v4, v71, v85 +v_fmac_f32_e32 v5, v72, v85 +v_fmac_f32_e32 v6, v73, v85 +v_fmac_f32_e32 v7, v74, v85 +v_fmac_f32_e32 v8, v75, v85 +v_fmac_f32_e32 v9, v76, v85 +v_fmac_f32_dpp v116, v116, v183 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v10, v69, v86 +v_fmac_f32_e32 v11, v70, v86 +v_fmac_f32_e32 v12, v71, v86 +s_setprio 1 +v_fmac_f32_e32 v13, v72, v86 +v_fmac_f32_e32 v14, v73, v86 +v_fmac_f32_e32 v15, v74, v86 +v_fmac_f32_e32 v16, v75, v86 +v_fmac_f32_e32 v17, v76, v86 +v_add_f32_e64 v114, v113, v116 div:2 +v_fmac_f32_e32 v18, v69, v87 +v_fmac_f32_e32 v19, v70, v87 +v_fmac_f32_e32 v20, v71, v87 +v_fmac_f32_e32 v21, v72, v87 +v_fmac_f32_e32 v22, v73, v87 +v_fmac_f32_e32 v23, v74, v87 +v_fmac_f32_e32 v24, v75, v87 +v_fmac_f32_e32 v25, v76, v87 +v_add_f32_e64 v115, v113, -v116 div:2 +v_fmac_f32_e32 v26, v69, v88 +v_fmac_f32_e32 v27, v70, v88 +v_fmac_f32_e32 v28, v71, v88 +v_fmac_f32_e32 v29, v72, v88 +v_fmac_f32_e32 v30, v73, v88 +v_fmac_f32_e32 v31, v74, v88 +v_fmac_f32_e32 v32, v75, v88 +v_fmac_f32_e32 v33, v76, v88 +v_fmac_f32_e32 v34, v69, v89 +v_fmac_f32_e32 v35, v70, v89 +v_fmac_f32_e32 v36, v71, v89 +v_fmac_f32_e32 v37, v72, v89 +v_fmac_f32_e32 v38, v73, v89 +v_fmac_f32_e32 v39, v74, v89 +v_fmac_f32_e32 v40, v75, v89 +v_fmac_f32_e32 v41, v76, v89 +v_fmac_f32_e32 v42, v69, v90 +v_fmac_f32_e32 v43, v70, v90 +v_fmac_f32_e32 v44, v71, v90 +v_fmac_f32_e32 v45, v72, v90 +v_fmac_f32_e32 v46, v73, v90 +v_fmac_f32_e32 v47, v74, v90 +v_fmac_f32_e32 v48, v75, v90 +v_fmac_f32_e32 v49, v76, v90 +v_fmac_f32_e32 v50, v69, v91 +v_fmac_f32_e32 v51, v70, v91 +v_fmac_f32_e32 v52, v71, v91 +v_fmac_f32_e32 v53, v72, v91 +v_fmac_f32_e32 v54, v73, v91 +v_fmac_f32_e32 v55, v74, v91 +v_fmac_f32_e32 v56, v75, v91 +v_fmac_f32_e32 v57, v76, v91 +v_fmac_f32_e32 v58, v69, v92 +v_fmac_f32_e32 v59, v70, v92 +v_fmac_f32_e32 v60, v71, v92 +v_fmac_f32_e32 v61, v72, v92 +v_fmac_f32_e32 v62, v73, v92 +v_fmac_f32_e32 v63, v74, v92 +v_fmac_f32_e32 v64, v75, v92 +v_fmac_f32_e32 v65, v76, v92 +ds_write_b32 v171, v105 +ds_read_b128 v[69:72], v175 offset:29440 +ds_write_b32 v172, v106 +ds_read_b128 v[73:76], v175 offset:29696 +ds_write_b32 v173, v107 +ds_read_b128 v[85:88], v165 offset:28928 +ds_write_b32 v174, v108 +ds_read_b128 v[89:92], v165 offset:29056 +s_setprio 0 +s_add_u32 s91, s91, 0x100 +s_add_u32 s40, s40, s69 +s_addc_u32 s41, s41, 0 +buffer_load_dword v101, v149, s[40:43], 0 offen +buffer_load_dword v104, v152, s[40:43], 0 offen +s_waitcnt vmcnt(14) lgkmcnt(8) +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 2 +s_call_b64 s[38:39], 1153 +s_nop 0 +v_fmac_f32_dpp v117, v117, v183 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v2, v77, v93 +v_fmac_f32_e32 v3, v78, v93 +v_fmac_f32_e32 v4, v79, v93 +v_fmac_f32_e32 v5, v80, v93 +v_fmac_f32_e32 v6, v81, v93 +v_fmac_f32_e32 v7, v82, v93 +v_fmac_f32_e32 v8, v83, v93 +v_fmac_f32_e32 v9, v84, v93 +v_fmac_f32_dpp v120, v120, v183 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v10, v77, v94 +v_fmac_f32_e32 v11, v78, v94 +v_fmac_f32_e32 v12, v79, v94 +s_setprio 1 +v_fmac_f32_e32 v13, v80, v94 +v_fmac_f32_e32 v14, v81, v94 +v_fmac_f32_e32 v15, v82, v94 +v_fmac_f32_e32 v16, v83, v94 +v_fmac_f32_e32 v17, v84, v94 +v_add_f32_e64 v118, v117, v120 div:2 +v_fmac_f32_e32 v18, v77, v95 +v_fmac_f32_e32 v19, v78, v95 +v_fmac_f32_e32 v20, v79, v95 +v_fmac_f32_e32 v21, v80, v95 +v_fmac_f32_e32 v22, v81, v95 +v_fmac_f32_e32 v23, v82, v95 +v_fmac_f32_e32 v24, v83, v95 +v_fmac_f32_e32 v25, v84, v95 +v_add_f32_e64 v119, v117, -v120 div:2 +v_fmac_f32_e32 v26, v77, v96 +v_fmac_f32_e32 v27, v78, v96 +v_fmac_f32_e32 v28, v79, v96 +v_fmac_f32_e32 v29, v80, v96 +v_fmac_f32_e32 v30, v81, v96 +v_fmac_f32_e32 v31, v82, v96 +v_fmac_f32_e32 v32, v83, v96 +v_fmac_f32_e32 v33, v84, v96 +v_fmac_f32_e32 v34, v77, v97 +v_fmac_f32_e32 v35, v78, v97 +v_fmac_f32_e32 v36, v79, v97 +v_fmac_f32_e32 v37, v80, v97 +v_fmac_f32_e32 v38, v81, v97 +v_fmac_f32_e32 v39, v82, v97 +v_fmac_f32_e32 v40, v83, v97 +v_fmac_f32_e32 v41, v84, v97 +v_fmac_f32_e32 v42, v77, v98 +v_fmac_f32_e32 v43, v78, v98 +v_fmac_f32_e32 v44, v79, v98 +v_fmac_f32_e32 v45, v80, v98 +v_fmac_f32_e32 v46, v81, v98 +v_fmac_f32_e32 v47, v82, v98 +v_fmac_f32_e32 v48, v83, v98 +v_fmac_f32_e32 v49, v84, v98 +v_fmac_f32_e32 v50, v77, v99 +v_fmac_f32_e32 v51, v78, v99 +v_fmac_f32_e32 v52, v79, v99 +v_fmac_f32_e32 v53, v80, v99 +v_fmac_f32_e32 v54, v81, v99 +v_fmac_f32_e32 v55, v82, v99 +v_fmac_f32_e32 v56, v83, v99 +v_fmac_f32_e32 v57, v84, v99 +v_fmac_f32_e32 v58, v77, v100 +v_fmac_f32_e32 v59, v78, v100 +v_fmac_f32_e32 v60, v79, v100 +v_fmac_f32_e32 v61, v80, v100 +v_fmac_f32_e32 v62, v81, v100 +v_fmac_f32_e32 v63, v82, v100 +v_fmac_f32_e32 v64, v83, v100 +v_fmac_f32_e32 v65, v84, v100 +ds_write_b32 v167, v109 offset:8256 +ds_read_b128 v[77:80], v175 offset:33536 +ds_write_b32 v168, v110 offset:8256 +ds_read_b128 v[81:84], v175 offset:33792 +ds_write_b32 v169, v111 offset:8256 +ds_read_b128 v[93:96], v165 offset:33024 +ds_write_b32 v170, v112 offset:8256 +ds_read_b128 v[97:100], v165 offset:33152 +s_setprio 0 +s_add_u32 s40, s40, s70 +s_addc_u32 s41, s41, s71 +buffer_load_dword v105, v153, s[40:43], 0 offen +buffer_load_dword v108, v156, s[40:43], 0 offen +s_waitcnt lgkmcnt(8) +ds_append v188 offset:65472 +s_bitset1_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 2 +s_call_b64 s[38:39], 1049 +s_nop 0 +v_fmac_f32_dpp v121, v121, v183 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v2, v69, v85 +v_fmac_f32_e32 v3, v70, v85 +v_fmac_f32_e32 v4, v71, v85 +v_fmac_f32_e32 v5, v72, v85 +v_fmac_f32_e32 v6, v73, v85 +v_fmac_f32_e32 v7, v74, v85 +v_fmac_f32_e32 v8, v75, v85 +v_fmac_f32_e32 v9, v76, v85 +v_fmac_f32_dpp v124, v124, v183 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v10, v69, v86 +v_fmac_f32_e32 v11, v70, v86 +v_fmac_f32_e32 v12, v71, v86 +s_setprio 1 +v_fmac_f32_e32 v13, v72, v86 +v_fmac_f32_e32 v14, v73, v86 +v_fmac_f32_e32 v15, v74, v86 +v_fmac_f32_e32 v16, v75, v86 +v_fmac_f32_e32 v17, v76, v86 +v_add_f32_e64 v122, v121, v124 div:2 +v_fmac_f32_e32 v18, v69, v87 +v_fmac_f32_e32 v19, v70, v87 +v_fmac_f32_e32 v20, v71, v87 +v_fmac_f32_e32 v21, v72, v87 +v_fmac_f32_e32 v22, v73, v87 +v_fmac_f32_e32 v23, v74, v87 +v_fmac_f32_e32 v24, v75, v87 +v_fmac_f32_e32 v25, v76, v87 +v_add_f32_e64 v123, v121, -v124 div:2 +v_fmac_f32_e32 v26, v69, v88 +v_fmac_f32_e32 v27, v70, v88 +v_fmac_f32_e32 v28, v71, v88 +v_fmac_f32_e32 v29, v72, v88 +v_fmac_f32_e32 v30, v73, v88 +v_fmac_f32_e32 v31, v74, v88 +v_fmac_f32_e32 v32, v75, v88 +v_fmac_f32_e32 v33, v76, v88 +v_fmac_f32_e32 v34, v69, v89 +v_fmac_f32_e32 v35, v70, v89 +v_fmac_f32_e32 v36, v71, v89 +v_fmac_f32_e32 v37, v72, v89 +v_fmac_f32_e32 v38, v73, v89 +v_fmac_f32_e32 v39, v74, v89 +v_fmac_f32_e32 v40, v75, v89 +v_fmac_f32_e32 v41, v76, v89 +v_fmac_f32_e32 v42, v69, v90 +v_fmac_f32_e32 v43, v70, v90 +v_fmac_f32_e32 v44, v71, v90 +v_fmac_f32_e32 v45, v72, v90 +v_fmac_f32_e32 v46, v73, v90 +v_fmac_f32_e32 v47, v74, v90 +v_fmac_f32_e32 v48, v75, v90 +v_fmac_f32_e32 v49, v76, v90 +v_fmac_f32_e32 v50, v69, v91 +v_fmac_f32_e32 v51, v70, v91 +v_fmac_f32_e32 v52, v71, v91 +v_fmac_f32_e32 v53, v72, v91 +v_fmac_f32_e32 v54, v73, v91 +v_fmac_f32_e32 v55, v74, v91 +v_fmac_f32_e32 v56, v75, v91 +v_fmac_f32_e32 v57, v76, v91 +v_fmac_f32_e32 v58, v69, v92 +v_fmac_f32_e32 v59, v70, v92 +v_fmac_f32_e32 v60, v71, v92 +v_fmac_f32_e32 v61, v72, v92 +v_fmac_f32_e32 v62, v73, v92 +v_fmac_f32_e32 v63, v74, v92 +v_fmac_f32_e32 v64, v75, v92 +v_fmac_f32_e32 v65, v76, v92 +ds_write_b32 v171, v113 offset:8256 +ds_read_b128 v[69:72], v175 offset:37696 +ds_write_b32 v172, v114 offset:8256 +ds_read_b128 v[73:76], v175 offset:37952 +ds_write_b32 v173, v115 offset:8256 +ds_read_b128 v[85:88], v165 offset:37184 +ds_write_b32 v174, v116 offset:8256 +ds_read_b128 v[89:92], v165 offset:37312 +s_setprio 0 +s_mov_b32 m0, 0x2ffc0 +s_add_u32 s40, s40, s69 +s_addc_u32 s41, s41, 0 +buffer_load_dword v109, v149, s[40:43], 0 offen +buffer_load_dword v112, v152, s[40:43], 0 offen +s_waitcnt vmcnt(14) lgkmcnt(8) +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 2 +s_call_b64 s[38:39], 945 +s_nop 0 +v_fmac_f32_dpp v125, v125, v183 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v2, v77, v93 +v_fmac_f32_e32 v3, v78, v93 +v_fmac_f32_e32 v4, v79, v93 +v_fmac_f32_e32 v5, v80, v93 +v_fmac_f32_e32 v6, v81, v93 +v_fmac_f32_e32 v7, v82, v93 +v_fmac_f32_e32 v8, v83, v93 +v_fmac_f32_e32 v9, v84, v93 +v_fmac_f32_dpp v128, v128, v183 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v10, v77, v94 +v_fmac_f32_e32 v11, v78, v94 +v_fmac_f32_e32 v12, v79, v94 +s_setprio 1 +v_fmac_f32_e32 v13, v80, v94 +v_fmac_f32_e32 v14, v81, v94 +v_fmac_f32_e32 v15, v82, v94 +v_fmac_f32_e32 v16, v83, v94 +v_fmac_f32_e32 v17, v84, v94 +v_add_f32_e64 v126, v125, v128 div:2 +v_fmac_f32_e32 v18, v77, v95 +v_fmac_f32_e32 v19, v78, v95 +v_fmac_f32_e32 v20, v79, v95 +v_fmac_f32_e32 v21, v80, v95 +v_fmac_f32_e32 v22, v81, v95 +v_fmac_f32_e32 v23, v82, v95 +v_fmac_f32_e32 v24, v83, v95 +v_fmac_f32_e32 v25, v84, v95 +v_add_f32_e64 v127, v125, -v128 div:2 +v_fmac_f32_e32 v26, v77, v96 +v_fmac_f32_e32 v27, v78, v96 +v_fmac_f32_e32 v28, v79, v96 +v_fmac_f32_e32 v29, v80, v96 +v_fmac_f32_e32 v30, v81, v96 +v_fmac_f32_e32 v31, v82, v96 +v_fmac_f32_e32 v32, v83, v96 +v_fmac_f32_e32 v33, v84, v96 +v_fmac_f32_e32 v34, v77, v97 +v_fmac_f32_e32 v35, v78, v97 +v_fmac_f32_e32 v36, v79, v97 +v_fmac_f32_e32 v37, v80, v97 +v_fmac_f32_e32 v38, v81, v97 +v_fmac_f32_e32 v39, v82, v97 +v_fmac_f32_e32 v40, v83, v97 +v_fmac_f32_e32 v41, v84, v97 +v_fmac_f32_e32 v42, v77, v98 +v_fmac_f32_e32 v43, v78, v98 +v_fmac_f32_e32 v44, v79, v98 +v_fmac_f32_e32 v45, v80, v98 +v_fmac_f32_e32 v46, v81, v98 +v_fmac_f32_e32 v47, v82, v98 +v_fmac_f32_e32 v48, v83, v98 +v_fmac_f32_e32 v49, v84, v98 +v_fmac_f32_e32 v50, v77, v99 +v_fmac_f32_e32 v51, v78, v99 +v_fmac_f32_e32 v52, v79, v99 +v_fmac_f32_e32 v53, v80, v99 +v_fmac_f32_e32 v54, v81, v99 +v_fmac_f32_e32 v55, v82, v99 +v_fmac_f32_e32 v56, v83, v99 +v_fmac_f32_e32 v57, v84, v99 +v_fmac_f32_e32 v58, v77, v100 +v_fmac_f32_e32 v59, v78, v100 +v_fmac_f32_e32 v60, v79, v100 +v_fmac_f32_e32 v61, v80, v100 +v_fmac_f32_e32 v62, v81, v100 +v_fmac_f32_e32 v63, v82, v100 +v_fmac_f32_e32 v64, v83, v100 +v_fmac_f32_e32 v65, v84, v100 +v_cmp_eq_u32_e64 vcc, src_lds_direct, s91 +s_cbranch_vccz 65533 +ds_write_b32 v167, v117 offset:16512 +ds_read_b128 v[77:80], v175 offset:41792 +ds_write_b32 v168, v118 offset:16512 +ds_read_b128 v[81:84], v175 offset:42048 +ds_write_b32 v169, v119 offset:16512 +ds_read_b128 v[93:96], v165 offset:41280 +ds_write_b32 v170, v120 offset:16512 +ds_read_b128 v[97:100], v165 offset:41408 +s_setprio 0 +s_add_u32 s40, s40, s70 +s_addc_u32 s41, s41, s71 +buffer_load_dword v113, v153, s[40:43], 0 offen +buffer_load_dword v116, v156, s[40:43], 0 offen +s_waitcnt lgkmcnt(8) +s_bitset1_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 1 +s_call_b64 s[38:39], 840 +v_fmac_f32_dpp v129, v129, v183 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v2, v69, v85 +v_fmac_f32_e32 v3, v70, v85 +v_fmac_f32_e32 v4, v71, v85 +v_fmac_f32_e32 v5, v72, v85 +v_fmac_f32_e32 v6, v73, v85 +v_fmac_f32_e32 v7, v74, v85 +v_fmac_f32_e32 v8, v75, v85 +v_fmac_f32_e32 v9, v76, v85 +v_fmac_f32_dpp v132, v132, v183 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v10, v69, v86 +v_fmac_f32_e32 v11, v70, v86 +v_fmac_f32_e32 v12, v71, v86 +s_setprio 1 +v_fmac_f32_e32 v13, v72, v86 +v_fmac_f32_e32 v14, v73, v86 +v_fmac_f32_e32 v15, v74, v86 +v_fmac_f32_e32 v16, v75, v86 +v_fmac_f32_e32 v17, v76, v86 +v_add_f32_e64 v130, v129, v132 div:2 +v_fmac_f32_e32 v18, v69, v87 +v_fmac_f32_e32 v19, v70, v87 +v_fmac_f32_e32 v20, v71, v87 +v_fmac_f32_e32 v21, v72, v87 +v_fmac_f32_e32 v22, v73, v87 +v_fmac_f32_e32 v23, v74, v87 +v_fmac_f32_e32 v24, v75, v87 +v_fmac_f32_e32 v25, v76, v87 +v_add_f32_e64 v131, v129, -v132 div:2 +v_fmac_f32_e32 v26, v69, v88 +v_fmac_f32_e32 v27, v70, v88 +v_fmac_f32_e32 v28, v71, v88 +v_fmac_f32_e32 v29, v72, v88 +v_fmac_f32_e32 v30, v73, v88 +v_fmac_f32_e32 v31, v74, v88 +v_fmac_f32_e32 v32, v75, v88 +v_fmac_f32_e32 v33, v76, v88 +v_fmac_f32_e32 v34, v69, v89 +v_fmac_f32_e32 v35, v70, v89 +v_fmac_f32_e32 v36, v71, v89 +v_fmac_f32_e32 v37, v72, v89 +v_fmac_f32_e32 v38, v73, v89 +v_fmac_f32_e32 v39, v74, v89 +v_fmac_f32_e32 v40, v75, v89 +v_fmac_f32_e32 v41, v76, v89 +v_fmac_f32_e32 v42, v69, v90 +v_fmac_f32_e32 v43, v70, v90 +v_fmac_f32_e32 v44, v71, v90 +v_fmac_f32_e32 v45, v72, v90 +v_fmac_f32_e32 v46, v73, v90 +v_fmac_f32_e32 v47, v74, v90 +v_fmac_f32_e32 v48, v75, v90 +v_fmac_f32_e32 v49, v76, v90 +v_fmac_f32_e32 v50, v69, v91 +v_fmac_f32_e32 v51, v70, v91 +v_fmac_f32_e32 v52, v71, v91 +v_fmac_f32_e32 v53, v72, v91 +v_fmac_f32_e32 v54, v73, v91 +v_fmac_f32_e32 v55, v74, v91 +v_fmac_f32_e32 v56, v75, v91 +v_fmac_f32_e32 v57, v76, v91 +v_fmac_f32_e32 v58, v69, v92 +v_fmac_f32_e32 v59, v70, v92 +v_fmac_f32_e32 v60, v71, v92 +v_fmac_f32_e32 v61, v72, v92 +v_fmac_f32_e32 v62, v73, v92 +v_fmac_f32_e32 v63, v74, v92 +v_fmac_f32_e32 v64, v75, v92 +v_fmac_f32_e32 v65, v76, v92 +ds_write_b32 v171, v121 offset:16512 +ds_read_b128 v[69:72], v175 offset:45952 +ds_write_b32 v172, v122 offset:16512 +ds_read_b128 v[73:76], v175 offset:46208 +ds_write_b32 v173, v123 offset:16512 +ds_read_b128 v[85:88], v165 offset:45440 +ds_write_b32 v174, v124 offset:16512 +ds_read_b128 v[89:92], v165 offset:45568 +s_setprio 0 +s_add_u32 s40, s40, s69 +s_addc_u32 s41, s41, 0 +buffer_load_dword v117, v149, s[40:43], 0 offen +buffer_load_dword v120, v152, s[40:43], 0 offen +s_waitcnt vmcnt(14) lgkmcnt(8) +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 4 +s_call_b64 s[38:39], 739 +s_nop 0 +s_nop 0 +s_nop 0 +v_fmac_f32_dpp v133, v133, v183 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v2, v77, v93 +v_fmac_f32_e32 v3, v78, v93 +v_fmac_f32_e32 v4, v79, v93 +v_fmac_f32_e32 v5, v80, v93 +v_fmac_f32_e32 v6, v81, v93 +v_fmac_f32_e32 v7, v82, v93 +v_fmac_f32_e32 v8, v83, v93 +v_fmac_f32_e32 v9, v84, v93 +v_fmac_f32_dpp v136, v136, v183 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v10, v77, v94 +v_fmac_f32_e32 v11, v78, v94 +v_fmac_f32_e32 v12, v79, v94 +s_setprio 1 +v_fmac_f32_e32 v13, v80, v94 +v_fmac_f32_e32 v14, v81, v94 +v_fmac_f32_e32 v15, v82, v94 +v_fmac_f32_e32 v16, v83, v94 +v_fmac_f32_e32 v17, v84, v94 +v_add_f32_e64 v134, v133, v136 div:2 +v_fmac_f32_e32 v18, v77, v95 +v_fmac_f32_e32 v19, v78, v95 +v_fmac_f32_e32 v20, v79, v95 +v_fmac_f32_e32 v21, v80, v95 +v_fmac_f32_e32 v22, v81, v95 +v_fmac_f32_e32 v23, v82, v95 +v_fmac_f32_e32 v24, v83, v95 +v_fmac_f32_e32 v25, v84, v95 +v_add_f32_e64 v135, v133, -v136 div:2 +v_fmac_f32_e32 v26, v77, v96 +v_fmac_f32_e32 v27, v78, v96 +v_fmac_f32_e32 v28, v79, v96 +v_fmac_f32_e32 v29, v80, v96 +v_fmac_f32_e32 v30, v81, v96 +v_fmac_f32_e32 v31, v82, v96 +v_fmac_f32_e32 v32, v83, v96 +v_fmac_f32_e32 v33, v84, v96 +v_fmac_f32_e32 v34, v77, v97 +v_fmac_f32_e32 v35, v78, v97 +v_fmac_f32_e32 v36, v79, v97 +v_fmac_f32_e32 v37, v80, v97 +v_fmac_f32_e32 v38, v81, v97 +v_fmac_f32_e32 v39, v82, v97 +v_fmac_f32_e32 v40, v83, v97 +v_fmac_f32_e32 v41, v84, v97 +v_fmac_f32_e32 v42, v77, v98 +v_fmac_f32_e32 v43, v78, v98 +v_fmac_f32_e32 v44, v79, v98 +v_fmac_f32_e32 v45, v80, v98 +v_fmac_f32_e32 v46, v81, v98 +v_fmac_f32_e32 v47, v82, v98 +v_fmac_f32_e32 v48, v83, v98 +v_fmac_f32_e32 v49, v84, v98 +v_fmac_f32_e32 v50, v77, v99 +v_fmac_f32_e32 v51, v78, v99 +v_fmac_f32_e32 v52, v79, v99 +v_fmac_f32_e32 v53, v80, v99 +v_fmac_f32_e32 v54, v81, v99 +v_fmac_f32_e32 v55, v82, v99 +v_fmac_f32_e32 v56, v83, v99 +v_fmac_f32_e32 v57, v84, v99 +v_fmac_f32_e32 v58, v77, v100 +v_fmac_f32_e32 v59, v78, v100 +v_fmac_f32_e32 v60, v79, v100 +v_fmac_f32_e32 v61, v80, v100 +v_fmac_f32_e32 v62, v81, v100 +v_fmac_f32_e32 v63, v82, v100 +v_fmac_f32_e32 v64, v83, v100 +v_fmac_f32_e32 v65, v84, v100 +ds_write_b32 v167, v125 offset:24768 +ds_read_b128 v[77:80], v175 offset:512 +ds_write_b32 v168, v126 offset:24768 +ds_read_b128 v[81:84], v175 offset:768 +ds_write_b32 v169, v127 offset:24768 +ds_read_b128 v[93:96], v165 +ds_write_b32 v170, v128 offset:24768 +ds_read_b128 v[97:100], v165 offset:128 +s_setprio 0 +s_add_u32 s40, s40, s70 +s_addc_u32 s41, s41, s71 +buffer_load_dword v121, v153, s[40:43], 0 offen +buffer_load_dword v124, v156, s[40:43], 0 offen +s_waitcnt lgkmcnt(8) +ds_append v188 offset:65476 +s_bitset1_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 2 +s_call_b64 s[38:39], 633 +s_nop 0 +v_fmac_f32_dpp v137, v137, v183 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v2, v69, v85 +v_fmac_f32_e32 v3, v70, v85 +v_fmac_f32_e32 v4, v71, v85 +v_fmac_f32_e32 v5, v72, v85 +v_fmac_f32_e32 v6, v73, v85 +v_fmac_f32_e32 v7, v74, v85 +v_fmac_f32_e32 v8, v75, v85 +v_fmac_f32_e32 v9, v76, v85 +v_fmac_f32_dpp v140, v140, v183 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v10, v69, v86 +v_fmac_f32_e32 v11, v70, v86 +v_fmac_f32_e32 v12, v71, v86 +s_setprio 1 +v_fmac_f32_e32 v13, v72, v86 +v_fmac_f32_e32 v14, v73, v86 +v_fmac_f32_e32 v15, v74, v86 +v_fmac_f32_e32 v16, v75, v86 +v_fmac_f32_e32 v17, v76, v86 +v_add_f32_e64 v138, v137, v140 div:2 +v_fmac_f32_e32 v18, v69, v87 +v_fmac_f32_e32 v19, v70, v87 +v_fmac_f32_e32 v20, v71, v87 +v_fmac_f32_e32 v21, v72, v87 +v_fmac_f32_e32 v22, v73, v87 +v_fmac_f32_e32 v23, v74, v87 +v_fmac_f32_e32 v24, v75, v87 +v_fmac_f32_e32 v25, v76, v87 +v_add_f32_e64 v139, v137, -v140 div:2 +v_fmac_f32_e32 v26, v69, v88 +v_fmac_f32_e32 v27, v70, v88 +v_fmac_f32_e32 v28, v71, v88 +v_fmac_f32_e32 v29, v72, v88 +v_fmac_f32_e32 v30, v73, v88 +v_fmac_f32_e32 v31, v74, v88 +v_fmac_f32_e32 v32, v75, v88 +v_fmac_f32_e32 v33, v76, v88 +v_fmac_f32_e32 v34, v69, v89 +v_fmac_f32_e32 v35, v70, v89 +v_fmac_f32_e32 v36, v71, v89 +v_fmac_f32_e32 v37, v72, v89 +v_fmac_f32_e32 v38, v73, v89 +v_fmac_f32_e32 v39, v74, v89 +v_fmac_f32_e32 v40, v75, v89 +v_fmac_f32_e32 v41, v76, v89 +v_fmac_f32_e32 v42, v69, v90 +v_fmac_f32_e32 v43, v70, v90 +v_fmac_f32_e32 v44, v71, v90 +v_fmac_f32_e32 v45, v72, v90 +v_fmac_f32_e32 v46, v73, v90 +v_fmac_f32_e32 v47, v74, v90 +v_fmac_f32_e32 v48, v75, v90 +v_fmac_f32_e32 v49, v76, v90 +v_fmac_f32_e32 v50, v69, v91 +v_fmac_f32_e32 v51, v70, v91 +v_fmac_f32_e32 v52, v71, v91 +v_fmac_f32_e32 v53, v72, v91 +v_fmac_f32_e32 v54, v73, v91 +v_fmac_f32_e32 v55, v74, v91 +v_fmac_f32_e32 v56, v75, v91 +v_fmac_f32_e32 v57, v76, v91 +v_fmac_f32_e32 v58, v69, v92 +v_fmac_f32_e32 v59, v70, v92 +v_fmac_f32_e32 v60, v71, v92 +v_fmac_f32_e32 v61, v72, v92 +v_fmac_f32_e32 v62, v73, v92 +v_fmac_f32_e32 v63, v74, v92 +v_fmac_f32_e32 v64, v75, v92 +v_fmac_f32_e32 v65, v76, v92 +ds_write_b32 v171, v129 offset:24768 +ds_read_b128 v[69:72], v175 offset:4672 +ds_write_b32 v172, v130 offset:24768 +ds_read_b128 v[73:76], v175 offset:4928 +ds_write_b32 v173, v131 offset:24768 +ds_read_b128 v[85:88], v165 offset:4160 +ds_write_b32 v174, v132 offset:24768 +ds_read_b128 v[89:92], v165 offset:4288 +s_setprio 0 +s_mov_b32 m0, 0x2ffc4 +s_add_u32 s40, s40, s69 +s_addc_u32 s41, s41, 0 +buffer_load_dword v125, v149, s[40:43], 0 offen +buffer_load_dword v128, v152, s[40:43], 0 offen +s_waitcnt vmcnt(14) lgkmcnt(8) +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 2 +s_call_b64 s[38:39], 529 +s_nop 0 +v_fmac_f32_dpp v141, v141, v183 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v2, v77, v93 +v_fmac_f32_e32 v3, v78, v93 +v_fmac_f32_e32 v4, v79, v93 +v_fmac_f32_e32 v5, v80, v93 +v_fmac_f32_e32 v6, v81, v93 +v_fmac_f32_e32 v7, v82, v93 +v_fmac_f32_e32 v8, v83, v93 +v_fmac_f32_e32 v9, v84, v93 +v_fmac_f32_dpp v144, v144, v183 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v10, v77, v94 +v_fmac_f32_e32 v11, v78, v94 +v_fmac_f32_e32 v12, v79, v94 +s_setprio 1 +v_fmac_f32_e32 v13, v80, v94 +v_fmac_f32_e32 v14, v81, v94 +v_fmac_f32_e32 v15, v82, v94 +v_fmac_f32_e32 v16, v83, v94 +v_fmac_f32_e32 v17, v84, v94 +v_add_f32_e64 v142, v141, v144 div:2 +v_fmac_f32_e32 v18, v77, v95 +v_fmac_f32_e32 v19, v78, v95 +v_fmac_f32_e32 v20, v79, v95 +v_fmac_f32_e32 v21, v80, v95 +v_fmac_f32_e32 v22, v81, v95 +v_fmac_f32_e32 v23, v82, v95 +v_fmac_f32_e32 v24, v83, v95 +v_fmac_f32_e32 v25, v84, v95 +v_add_f32_e64 v143, v141, -v144 div:2 +v_fmac_f32_e32 v26, v77, v96 +v_fmac_f32_e32 v27, v78, v96 +v_fmac_f32_e32 v28, v79, v96 +v_fmac_f32_e32 v29, v80, v96 +v_fmac_f32_e32 v30, v81, v96 +v_fmac_f32_e32 v31, v82, v96 +v_fmac_f32_e32 v32, v83, v96 +v_fmac_f32_e32 v33, v84, v96 +v_fmac_f32_e32 v34, v77, v97 +v_fmac_f32_e32 v35, v78, v97 +v_fmac_f32_e32 v36, v79, v97 +v_fmac_f32_e32 v37, v80, v97 +v_fmac_f32_e32 v38, v81, v97 +v_fmac_f32_e32 v39, v82, v97 +v_fmac_f32_e32 v40, v83, v97 +v_fmac_f32_e32 v41, v84, v97 +v_fmac_f32_e32 v42, v77, v98 +v_fmac_f32_e32 v43, v78, v98 +v_fmac_f32_e32 v44, v79, v98 +v_fmac_f32_e32 v45, v80, v98 +v_fmac_f32_e32 v46, v81, v98 +v_fmac_f32_e32 v47, v82, v98 +v_fmac_f32_e32 v48, v83, v98 +v_fmac_f32_e32 v49, v84, v98 +v_fmac_f32_e32 v50, v77, v99 +v_fmac_f32_e32 v51, v78, v99 +v_fmac_f32_e32 v52, v79, v99 +v_fmac_f32_e32 v53, v80, v99 +v_fmac_f32_e32 v54, v81, v99 +v_fmac_f32_e32 v55, v82, v99 +v_fmac_f32_e32 v56, v83, v99 +v_fmac_f32_e32 v57, v84, v99 +v_fmac_f32_e32 v58, v77, v100 +v_fmac_f32_e32 v59, v78, v100 +v_fmac_f32_e32 v60, v79, v100 +v_fmac_f32_e32 v61, v80, v100 +v_fmac_f32_e32 v62, v81, v100 +v_fmac_f32_e32 v63, v82, v100 +v_fmac_f32_e32 v64, v83, v100 +v_fmac_f32_e32 v65, v84, v100 +v_cmp_eq_u32_e64 vcc, src_lds_direct, s91 +s_cbranch_vccz 65533 +ds_write_b32 v167, v133 offset:33024 +ds_read_b128 v[77:80], v175 offset:8768 +ds_write_b32 v168, v134 offset:33024 +ds_read_b128 v[81:84], v175 offset:9024 +ds_write_b32 v169, v135 offset:33024 +ds_read_b128 v[93:96], v165 offset:8256 +ds_write_b32 v170, v136 offset:33024 +ds_read_b128 v[97:100], v165 offset:8384 +s_setprio 0 +s_add_u32 s40, s40, s70 +s_addc_u32 s41, s41, s71 +buffer_load_dword v129, v153, s[40:43], 0 offen +buffer_load_dword v132, v156, s[40:43], 0 offen +s_waitcnt lgkmcnt(8) +s_bitset1_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 1 +s_call_b64 s[38:39], 424 +v_fmac_f32_dpp v145, v145, v183 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v2, v69, v85 +v_fmac_f32_e32 v3, v70, v85 +v_fmac_f32_e32 v4, v71, v85 +v_fmac_f32_e32 v5, v72, v85 +v_fmac_f32_e32 v6, v73, v85 +v_fmac_f32_e32 v7, v74, v85 +v_fmac_f32_e32 v8, v75, v85 +v_fmac_f32_e32 v9, v76, v85 +v_fmac_f32_dpp v148, v148, v183 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v10, v69, v86 +v_fmac_f32_e32 v11, v70, v86 +v_fmac_f32_e32 v12, v71, v86 +s_setprio 1 +v_fmac_f32_e32 v13, v72, v86 +v_fmac_f32_e32 v14, v73, v86 +v_fmac_f32_e32 v15, v74, v86 +v_fmac_f32_e32 v16, v75, v86 +v_fmac_f32_e32 v17, v76, v86 +v_add_f32_e64 v146, v145, v148 div:2 +v_fmac_f32_e32 v18, v69, v87 +v_fmac_f32_e32 v19, v70, v87 +v_fmac_f32_e32 v20, v71, v87 +v_fmac_f32_e32 v21, v72, v87 +v_fmac_f32_e32 v22, v73, v87 +v_fmac_f32_e32 v23, v74, v87 +v_fmac_f32_e32 v24, v75, v87 +v_fmac_f32_e32 v25, v76, v87 +v_add_f32_e64 v147, v145, -v148 div:2 +v_fmac_f32_e32 v26, v69, v88 +v_fmac_f32_e32 v27, v70, v88 +v_fmac_f32_e32 v28, v71, v88 +v_fmac_f32_e32 v29, v72, v88 +v_fmac_f32_e32 v30, v73, v88 +v_fmac_f32_e32 v31, v74, v88 +v_fmac_f32_e32 v32, v75, v88 +v_fmac_f32_e32 v33, v76, v88 +v_fmac_f32_e32 v34, v69, v89 +v_fmac_f32_e32 v35, v70, v89 +v_fmac_f32_e32 v36, v71, v89 +v_fmac_f32_e32 v37, v72, v89 +v_fmac_f32_e32 v38, v73, v89 +v_fmac_f32_e32 v39, v74, v89 +v_fmac_f32_e32 v40, v75, v89 +v_fmac_f32_e32 v41, v76, v89 +v_fmac_f32_e32 v42, v69, v90 +v_fmac_f32_e32 v43, v70, v90 +v_fmac_f32_e32 v44, v71, v90 +v_fmac_f32_e32 v45, v72, v90 +v_fmac_f32_e32 v46, v73, v90 +v_fmac_f32_e32 v47, v74, v90 +v_fmac_f32_e32 v48, v75, v90 +v_fmac_f32_e32 v49, v76, v90 +v_fmac_f32_e32 v50, v69, v91 +v_fmac_f32_e32 v51, v70, v91 +v_fmac_f32_e32 v52, v71, v91 +v_fmac_f32_e32 v53, v72, v91 +v_fmac_f32_e32 v54, v73, v91 +v_fmac_f32_e32 v55, v74, v91 +v_fmac_f32_e32 v56, v75, v91 +v_fmac_f32_e32 v57, v76, v91 +v_fmac_f32_e32 v58, v69, v92 +v_fmac_f32_e32 v59, v70, v92 +v_fmac_f32_e32 v60, v71, v92 +v_fmac_f32_e32 v61, v72, v92 +v_fmac_f32_e32 v62, v73, v92 +v_fmac_f32_e32 v63, v74, v92 +v_fmac_f32_e32 v64, v75, v92 +v_fmac_f32_e32 v65, v76, v92 +ds_write_b32 v171, v137 offset:33024 +ds_read_b128 v[69:72], v175 offset:12928 +ds_write_b32 v172, v138 offset:33024 +ds_read_b128 v[73:76], v175 offset:13184 +ds_write_b32 v173, v139 offset:33024 +ds_read_b128 v[85:88], v165 offset:12416 +ds_write_b32 v174, v140 offset:33024 +ds_read_b128 v[89:92], v165 offset:12544 +s_setprio 0 +s_add_u32 s40, s40, s69 +s_addc_u32 s41, s41, 0 +buffer_load_dword v133, v149, s[40:43], 0 offen +buffer_load_dword v136, v152, s[40:43], 0 offen +s_waitcnt vmcnt(14) lgkmcnt(8) +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 4 +s_call_b64 s[38:39], 323 +s_nop 0 +s_nop 0 +s_nop 0 +v_fmac_f32_dpp v101, v101, v183 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v2, v77, v93 +v_fmac_f32_e32 v3, v78, v93 +v_fmac_f32_e32 v4, v79, v93 +v_fmac_f32_e32 v5, v80, v93 +v_fmac_f32_e32 v6, v81, v93 +v_fmac_f32_e32 v7, v82, v93 +v_fmac_f32_e32 v8, v83, v93 +v_fmac_f32_e32 v9, v84, v93 +v_fmac_f32_dpp v104, v104, v183 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v10, v77, v94 +v_fmac_f32_e32 v11, v78, v94 +v_fmac_f32_e32 v12, v79, v94 +s_setprio 1 +v_fmac_f32_e32 v13, v80, v94 +v_fmac_f32_e32 v14, v81, v94 +v_fmac_f32_e32 v15, v82, v94 +v_fmac_f32_e32 v16, v83, v94 +v_fmac_f32_e32 v17, v84, v94 +v_add_f32_e64 v102, v101, v104 div:2 +v_fmac_f32_e32 v18, v77, v95 +v_fmac_f32_e32 v19, v78, v95 +v_fmac_f32_e32 v20, v79, v95 +v_fmac_f32_e32 v21, v80, v95 +v_fmac_f32_e32 v22, v81, v95 +v_fmac_f32_e32 v23, v82, v95 +v_fmac_f32_e32 v24, v83, v95 +v_fmac_f32_e32 v25, v84, v95 +v_add_f32_e64 v103, v101, -v104 div:2 +v_fmac_f32_e32 v26, v77, v96 +v_fmac_f32_e32 v27, v78, v96 +v_fmac_f32_e32 v28, v79, v96 +v_fmac_f32_e32 v29, v80, v96 +v_fmac_f32_e32 v30, v81, v96 +v_fmac_f32_e32 v31, v82, v96 +v_fmac_f32_e32 v32, v83, v96 +v_fmac_f32_e32 v33, v84, v96 +v_fmac_f32_e32 v34, v77, v97 +v_fmac_f32_e32 v35, v78, v97 +v_fmac_f32_e32 v36, v79, v97 +v_fmac_f32_e32 v37, v80, v97 +v_fmac_f32_e32 v38, v81, v97 +v_fmac_f32_e32 v39, v82, v97 +v_fmac_f32_e32 v40, v83, v97 +v_fmac_f32_e32 v41, v84, v97 +v_fmac_f32_e32 v42, v77, v98 +v_fmac_f32_e32 v43, v78, v98 +v_fmac_f32_e32 v44, v79, v98 +v_fmac_f32_e32 v45, v80, v98 +v_fmac_f32_e32 v46, v81, v98 +v_fmac_f32_e32 v47, v82, v98 +v_fmac_f32_e32 v48, v83, v98 +v_fmac_f32_e32 v49, v84, v98 +v_fmac_f32_e32 v50, v77, v99 +v_fmac_f32_e32 v51, v78, v99 +v_fmac_f32_e32 v52, v79, v99 +v_fmac_f32_e32 v53, v80, v99 +v_fmac_f32_e32 v54, v81, v99 +v_fmac_f32_e32 v55, v82, v99 +v_fmac_f32_e32 v56, v83, v99 +v_fmac_f32_e32 v57, v84, v99 +v_fmac_f32_e32 v58, v77, v100 +v_fmac_f32_e32 v59, v78, v100 +v_fmac_f32_e32 v60, v79, v100 +v_fmac_f32_e32 v61, v80, v100 +v_fmac_f32_e32 v62, v81, v100 +v_fmac_f32_e32 v63, v82, v100 +v_fmac_f32_e32 v64, v83, v100 +v_fmac_f32_e32 v65, v84, v100 +ds_write_b32 v167, v141 offset:41280 +ds_read_b128 v[77:80], v175 offset:17024 +ds_write_b32 v168, v142 offset:41280 +ds_read_b128 v[81:84], v175 offset:17280 +ds_write_b32 v169, v143 offset:41280 +ds_read_b128 v[93:96], v165 offset:16512 +ds_write_b32 v170, v144 offset:41280 +ds_read_b128 v[97:100], v165 offset:16640 +s_setprio 0 +s_add_u32 s40, s40, s70 +s_addc_u32 s41, s41, s71 +buffer_load_dword v137, v153, s[40:43], 0 offen +buffer_load_dword v140, v156, s[40:43], 0 offen +s_waitcnt lgkmcnt(8) +ds_append v188 offset:65480 +s_bitset1_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 2 +s_call_b64 s[38:39], 217 +s_nop 0 +v_fmac_f32_dpp v105, v105, v183 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v2, v69, v85 +v_fmac_f32_e32 v3, v70, v85 +v_fmac_f32_e32 v4, v71, v85 +v_fmac_f32_e32 v5, v72, v85 +v_fmac_f32_e32 v6, v73, v85 +v_fmac_f32_e32 v7, v74, v85 +v_fmac_f32_e32 v8, v75, v85 +v_fmac_f32_e32 v9, v76, v85 +v_fmac_f32_dpp v108, v108, v183 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v10, v69, v86 +v_fmac_f32_e32 v11, v70, v86 +v_fmac_f32_e32 v12, v71, v86 +s_setprio 1 +v_fmac_f32_e32 v13, v72, v86 +v_fmac_f32_e32 v14, v73, v86 +v_fmac_f32_e32 v15, v74, v86 +v_fmac_f32_e32 v16, v75, v86 +v_fmac_f32_e32 v17, v76, v86 +v_add_f32_e64 v106, v105, v108 div:2 +v_fmac_f32_e32 v18, v69, v87 +v_fmac_f32_e32 v19, v70, v87 +v_fmac_f32_e32 v20, v71, v87 +v_fmac_f32_e32 v21, v72, v87 +v_fmac_f32_e32 v22, v73, v87 +v_fmac_f32_e32 v23, v74, v87 +v_fmac_f32_e32 v24, v75, v87 +v_fmac_f32_e32 v25, v76, v87 +v_add_f32_e64 v107, v105, -v108 div:2 +v_fmac_f32_e32 v26, v69, v88 +v_fmac_f32_e32 v27, v70, v88 +v_fmac_f32_e32 v28, v71, v88 +v_fmac_f32_e32 v29, v72, v88 +v_fmac_f32_e32 v30, v73, v88 +v_fmac_f32_e32 v31, v74, v88 +v_fmac_f32_e32 v32, v75, v88 +v_fmac_f32_e32 v33, v76, v88 +v_fmac_f32_e32 v34, v69, v89 +v_fmac_f32_e32 v35, v70, v89 +v_fmac_f32_e32 v36, v71, v89 +v_fmac_f32_e32 v37, v72, v89 +v_fmac_f32_e32 v38, v73, v89 +v_fmac_f32_e32 v39, v74, v89 +v_fmac_f32_e32 v40, v75, v89 +v_fmac_f32_e32 v41, v76, v89 +v_fmac_f32_e32 v42, v69, v90 +v_fmac_f32_e32 v43, v70, v90 +v_fmac_f32_e32 v44, v71, v90 +v_fmac_f32_e32 v45, v72, v90 +v_fmac_f32_e32 v46, v73, v90 +v_fmac_f32_e32 v47, v74, v90 +v_fmac_f32_e32 v48, v75, v90 +v_fmac_f32_e32 v49, v76, v90 +v_fmac_f32_e32 v50, v69, v91 +v_fmac_f32_e32 v51, v70, v91 +v_fmac_f32_e32 v52, v71, v91 +v_fmac_f32_e32 v53, v72, v91 +v_fmac_f32_e32 v54, v73, v91 +v_fmac_f32_e32 v55, v74, v91 +v_fmac_f32_e32 v56, v75, v91 +v_fmac_f32_e32 v57, v76, v91 +v_fmac_f32_e32 v58, v69, v92 +v_fmac_f32_e32 v59, v70, v92 +v_fmac_f32_e32 v60, v71, v92 +v_fmac_f32_e32 v61, v72, v92 +v_fmac_f32_e32 v62, v73, v92 +v_fmac_f32_e32 v63, v74, v92 +v_fmac_f32_e32 v64, v75, v92 +v_fmac_f32_e32 v65, v76, v92 +ds_write_b32 v171, v145 offset:41280 +ds_read_b128 v[69:72], v175 offset:21184 +ds_write_b32 v172, v146 offset:41280 +ds_read_b128 v[73:76], v175 offset:21440 +ds_write_b32 v173, v147 offset:41280 +ds_read_b128 v[85:88], v165 offset:20672 +ds_write_b32 v174, v148 offset:41280 +ds_read_b128 v[89:92], v165 offset:20800 +s_setprio 0 +s_mov_b32 m0, 0x2ffc8 +s_add_u32 s40, s40, s69 +s_addc_u32 s41, s41, 0 +buffer_load_dword v141, v149, s[40:43], 0 offen +buffer_load_dword v144, v152, s[40:43], 0 offen +s_waitcnt vmcnt(14) lgkmcnt(8) +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 2 +s_call_b64 s[38:39], 113 +s_nop 0 +v_fmac_f32_dpp v109, v109, v183 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v2, v77, v93 +v_fmac_f32_e32 v3, v78, v93 +v_fmac_f32_e32 v4, v79, v93 +v_fmac_f32_e32 v5, v80, v93 +v_fmac_f32_e32 v6, v81, v93 +v_fmac_f32_e32 v7, v82, v93 +v_fmac_f32_e32 v8, v83, v93 +v_fmac_f32_e32 v9, v84, v93 +v_fmac_f32_dpp v112, v112, v183 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_fmac_f32_e32 v10, v77, v94 +v_fmac_f32_e32 v11, v78, v94 +v_fmac_f32_e32 v12, v79, v94 +s_setprio 1 +v_fmac_f32_e32 v13, v80, v94 +v_fmac_f32_e32 v14, v81, v94 +v_fmac_f32_e32 v15, v82, v94 +v_fmac_f32_e32 v16, v83, v94 +v_fmac_f32_e32 v17, v84, v94 +v_add_f32_e64 v110, v109, v112 div:2 +v_fmac_f32_e32 v18, v77, v95 +v_fmac_f32_e32 v19, v78, v95 +v_fmac_f32_e32 v20, v79, v95 +v_fmac_f32_e32 v21, v80, v95 +v_fmac_f32_e32 v22, v81, v95 +v_fmac_f32_e32 v23, v82, v95 +v_fmac_f32_e32 v24, v83, v95 +v_fmac_f32_e32 v25, v84, v95 +v_add_f32_e64 v111, v109, -v112 div:2 +v_fmac_f32_e32 v26, v77, v96 +v_fmac_f32_e32 v27, v78, v96 +v_fmac_f32_e32 v28, v79, v96 +v_fmac_f32_e32 v29, v80, v96 +v_fmac_f32_e32 v30, v81, v96 +v_fmac_f32_e32 v31, v82, v96 +v_fmac_f32_e32 v32, v83, v96 +v_fmac_f32_e32 v33, v84, v96 +v_fmac_f32_e32 v34, v77, v97 +v_fmac_f32_e32 v35, v78, v97 +v_fmac_f32_e32 v36, v79, v97 +v_fmac_f32_e32 v37, v80, v97 +v_fmac_f32_e32 v38, v81, v97 +v_fmac_f32_e32 v39, v82, v97 +v_fmac_f32_e32 v40, v83, v97 +v_fmac_f32_e32 v41, v84, v97 +v_fmac_f32_e32 v42, v77, v98 +v_fmac_f32_e32 v43, v78, v98 +v_fmac_f32_e32 v44, v79, v98 +v_fmac_f32_e32 v45, v80, v98 +v_fmac_f32_e32 v46, v81, v98 +v_fmac_f32_e32 v47, v82, v98 +v_fmac_f32_e32 v48, v83, v98 +v_fmac_f32_e32 v49, v84, v98 +v_fmac_f32_e32 v50, v77, v99 +v_fmac_f32_e32 v51, v78, v99 +v_fmac_f32_e32 v52, v79, v99 +v_fmac_f32_e32 v53, v80, v99 +v_fmac_f32_e32 v54, v81, v99 +v_fmac_f32_e32 v55, v82, v99 +v_fmac_f32_e32 v56, v83, v99 +v_fmac_f32_e32 v57, v84, v99 +v_fmac_f32_e32 v58, v77, v100 +v_fmac_f32_e32 v59, v78, v100 +v_fmac_f32_e32 v60, v79, v100 +v_fmac_f32_e32 v61, v80, v100 +v_fmac_f32_e32 v62, v81, v100 +v_fmac_f32_e32 v63, v82, v100 +v_fmac_f32_e32 v64, v83, v100 +v_fmac_f32_e32 v65, v84, v100 +v_cmp_eq_u32_e64 vcc, src_lds_direct, s91 +s_cbranch_vccz 65533 +ds_write_b32 v167, v101 +ds_read_b128 v[77:80], v175 offset:25280 +ds_write_b32 v168, v102 +ds_read_b128 v[81:84], v175 offset:25536 +ds_write_b32 v169, v103 +ds_read_b128 v[93:96], v165 offset:24768 +ds_write_b32 v170, v104 +ds_read_b128 v[97:100], v165 offset:24896 +s_setprio 0 +s_add_u32 s40, s40, s70 +s_addc_u32 s41, s41, s71 +buffer_load_dword v145, v153, s[40:43], 0 offen +buffer_load_dword v148, v156, s[40:43], 0 offen +s_waitcnt lgkmcnt(8) +s_bitset1_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 64289 +s_call_b64 s[38:39], 8 +s_branch 64287 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +v_nop +s_cmp_eq_u32 s82, 0 +s_cbranch_scc0 6 +s_branch 724 +s_bitcmp1_b32 s18, 26 +s_cselect_b32 s52, s69, s70 +s_cselect_b32 s53, 0, s71 +s_sub_u32 s40, s40, s52 +s_subb_u32 s41, s41, s53 +s_cmp_eq_u32 s94, 0 +s_cbranch_scc0 5 +s_cbranch_scc1 748 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_min_u32 s72, s82, s94 +s_sub_u32 s82, s82, s72 +s_sub_u32 s94, s94, s72 +s_sub_u32 s72, s72, 1 +s_setpc_b64 s[38:39] +s_nop 0 +s_nop 0 +s_nop 0 +s_bitcmp1_b32 s18, 17 +s_cbranch_scc1 253 +s_add_u32 s88, s88, s17 +s_cmp_eq_u32 s88, 0 +s_cbranch_scc1 250 +s_mov_b32 s89, 0 +s_bitcmp1_b32 s18, 16 +s_cbranch_scc1 239 +s_add_u32 s87, s16, 31 +s_lshr_b32 s87, s87, 5 +v_mov_b32_e32 v190, s88 +v_mul_u32_u24_e32 v190, s87, v190 +v_add_co_u32_e64 v190, vcc, s17, v190 +v_sub_co_u32_e64 v190, vcc, v190, 1 +v_ffbh_u32_e32 v193, s17 +v_lshlrev_b32_e64 v194, v193, s17 +v_and_b32_e32 v195, 0xffffff00, v194 +v_cmp_eq_u32_e32 vcc, 0x80000000, v194 +v_cvt_f32_u32_e32 v195, v195 +v_rcp_f32_e32 v189, v195 +v_sub_co_ci_u32_e32 v192, vcc, 32, v193, vcc +v_cvt_f32_ubyte0_e32 v193, v194 +v_fma_f32 v195, v195, v189, -1.0 +v_fma_f32 v195, v193, v189, v195 +v_fmaak_f32 v195, v195, v189, 0x9f000000 +v_mul_f32_e32 v195, 0x5f800000, v195 +v_mov_b32_e32 v193, 0 +v_cvt_flr_i32_f32_e64 v195, -v195 +v_lshl_add_u32 v189, v189, 9, v195 +v_mad_u64_u32 v[193:194], vcc, v194, v189, v[193:194] +v_sub_co_ci_u32_e64 v189, vcc, v189, -1, vcc +v_mul_hi_u32 v193, v190, v189 +v_add_co_u32_e64 v189, vcc, v193, v190 +v_add_co_ci_u32_e64 v193, vcc, 0, 0, vcc +v_cmp_eq_u32_e32 vcc, 32, v192 +v_cndmask_b32_e32 v189, v189, v193, vcc +v_alignbit_b32 v189, v193, v189, v192 +v_readfirstlane_b32 s86, v189 +v_mul_u32_u24_e64 v189, v189, s8 +v_ffbh_u32_e32 v193, s87 +v_lshlrev_b32_e64 v194, v193, s87 +v_and_b32_e32 v195, 0xffffff00, v194 +v_cmp_eq_u32_e32 vcc, 0x80000000, v194 +v_cvt_f32_u32_e32 v195, v195 +v_rcp_f32_e32 v190, v195 +v_sub_co_ci_u32_e32 v192, vcc, 32, v193, vcc +v_cvt_f32_ubyte0_e32 v193, v194 +v_fma_f32 v195, v195, v190, -1.0 +v_fma_f32 v195, v193, v190, v195 +v_fmaak_f32 v195, v195, v190, 0x9f000000 +v_mul_f32_e32 v195, 0x5f800000, v195 +v_mov_b32_e32 v193, 0 +v_cvt_flr_i32_f32_e64 v195, -v195 +v_lshl_add_u32 v190, v190, 9, v195 +v_mad_u64_u32 v[193:194], vcc, v194, v190, v[193:194] +v_sub_co_ci_u32_e64 v190, vcc, v190, -1, vcc +v_mul_hi_u32 v193, v189, v190 +v_add_co_u32_e64 v190, vcc, v193, v189 +v_add_co_ci_u32_e64 v193, vcc, 0, 0, vcc +v_cmp_eq_u32_e32 vcc, 32, v192 +v_cndmask_b32_e32 v190, v190, v193, vcc +v_alignbit_b32 v190, v193, v190, v192 +v_readfirstlane_b32 s52, v189 +v_readfirstlane_b32 s84, v190 +s_mul_i32 s84, s84, s87 +s_sub_u32 s84, s52, s84 +v_sub_co_u32_e64 v190, vcc, s8, v190 +v_sub_co_u32_e64 v190, vcc, s17, v190 +v_and_b32_e64 v192, v1, 63 +v_cmp_eq_u32_e64 vcc, v192, 0 +v_cndmask_b32_e32 v190, 1, v190, vcc +s_sub_u32 s58, 0, s75 +s_sub_u32 s59, 0, s74 +v_mul_u32_u24_e64 v194, v190, 32 +v_ffbh_u32_e32 v196, s58 +v_lshlrev_b32_e64 v197, v196, s58 +v_and_b32_e32 v198, 0xffffff00, v197 +v_cmp_eq_u32_e32 vcc, 0x80000000, v197 +v_cvt_f32_u32_e32 v198, v198 +v_rcp_f32_e32 v192, v198 +v_sub_co_ci_u32_e32 v195, vcc, 32, v196, vcc +v_cvt_f32_ubyte0_e32 v196, v197 +v_fma_f32 v198, v198, v192, -1.0 +v_fma_f32 v198, v196, v192, v198 +v_fmaak_f32 v198, v198, v192, 0x9f000000 +v_mul_f32_e32 v198, 0x5f800000, v198 +v_mov_b32_e32 v196, 0 +v_cvt_flr_i32_f32_e64 v198, -v198 +v_lshl_add_u32 v192, v192, 9, v198 +v_mad_u64_u32 v[196:197], vcc, v197, v192, v[196:197] +v_sub_co_ci_u32_e64 v192, vcc, v192, -1, vcc +v_mul_hi_u32 v196, v194, v192 +v_add_co_u32_e64 v192, vcc, v196, v194 +v_add_co_ci_u32_e64 v196, vcc, 0, 0, vcc +v_cmp_eq_u32_e32 vcc, 32, v195 +v_cndmask_b32_e32 v192, v192, v196, vcc +v_alignbit_b32 v192, v196, v192, v195 +v_mad_i32_i24 v193, v192, s75, v194 +v_mul_u32_u24_e64 v194, v192, 1 +v_ffbh_u32_e32 v196, s59 +v_lshlrev_b32_e64 v197, v196, s59 +v_and_b32_e32 v198, 0xffffff00, v197 +v_cmp_eq_u32_e32 vcc, 0x80000000, v197 +v_cvt_f32_u32_e32 v198, v198 +v_rcp_f32_e32 v192, v198 +v_sub_co_ci_u32_e32 v195, vcc, 32, v196, vcc +v_cvt_f32_ubyte0_e32 v196, v197 +v_fma_f32 v198, v198, v192, -1.0 +v_fma_f32 v198, v196, v192, v198 +v_fmaak_f32 v198, v198, v192, 0x9f000000 +v_mul_f32_e32 v198, 0x5f800000, v198 +v_mov_b32_e32 v196, 0 +v_cvt_flr_i32_f32_e64 v198, -v198 +v_lshl_add_u32 v192, v192, 9, v198 +v_mad_u64_u32 v[196:197], vcc, v197, v192, v[196:197] +v_sub_co_ci_u32_e64 v192, vcc, v192, -1, vcc +v_mul_hi_u32 v196, v194, v192 +v_add_co_u32_e64 v192, vcc, v196, v194 +v_add_co_ci_u32_e64 v196, vcc, 0, 0, vcc +v_cmp_eq_u32_e32 vcc, 32, v195 +v_cndmask_b32_e32 v192, v192, v196, vcc +v_alignbit_b32 v192, v196, v192, v195 +v_mad_i32_i24 v194, v192, s74, v194 +v_readfirstlane_b32 s76, v193 +v_readfirstlane_b32 s77, v194 +v_readfirstlane_b32 s78, v192 +v_add_co_u32_e64 v179, vcc, s76, v179 +v_add_co_ci_u32_e64 v195, vcc, 0, 0, vcc +v_mad_i32_i24 v179, v195, s75, v179 +v_mad_i32_i24 v181, v195, s80, v181 +v_mad_i32_i24 v180, v195, s79, v180 +v_cmp_ge_i32_e64 vcc, v180, 0 +v_add_co_ci_u32_e64 v195, vcc, 0, 0, vcc +v_add_co_u32_e64 v181, vcc, v181, v195 +v_mad_i32_i24 v180, v195, s74, v180 +v_add_co_u32_e64 v180, vcc, s77, v180 +v_add_co_ci_u32_e64 v195, vcc, 0, 0, vcc +v_add_co_u32_e64 v181, vcc, v181, v195 +v_mad_i32_i24 v180, v195, s74, v180 +v_add_co_u32_e64 v181, vcc, s78, v181 +v_readlane_b32 s76, v193, 1 +v_readlane_b32 s77, v194, 1 +v_readlane_b32 s78, v192, 1 +s_add_u32 s85, s84, s86 +s_cmp_le_u32 s85, s87 +s_cselect_b32 s52, 0x20000, 0 +s_cselect_b32 s85, s85, s87 +s_or_b32 s18, s18, s52 +s_lshl_b32 s84, s84, 5 +s_lshl_b32 s85, s85, 5 +s_min_u32 s85, s85, s16 +s_cmp_eq_u32 s8, s17 +s_cselect_b32 s52, 0x20000, 0 +s_or_b32 s18, s18, s52 +s_or_b32 s18, s18, s52 +s_bitset1_b32 s18, 16 +s_branch 48 +s_lshr_b32 s84, s84, 5 +s_add_u32 s85, s84, s86 +s_sub_u32 s85, s85, s87 +s_mov_b32 s84, 0 +s_lshl_b32 s85, s85, 5 +s_min_u32 s85, s85, s16 +s_bitset1_b32 s18, 17 +s_branch 12 +s_bitset1_b32 s18, 18 +s_mov_b32 s43, 0 +s_mov_b32 s73, -1 +s_mov_b32 s82, 40 +s_branch 36 +s_add_u32 s83, s83, 32 +s_cmp_ge_u32 s83, s85 +s_cbranch_scc0 33 +s_bitset1_b32 s18, 22 +s_sub_u32 s88, s88, s17 +s_subb_u32 s89, s89, 0 +s_cbranch_scc1 65269 +v_add_co_u32_e64 v179, vcc, s76, v179 +v_add_co_ci_u32_e64 v189, vcc, 0, 0, vcc +v_mad_i32_i24 v179, v189, s75, v179 +v_mad_i32_i24 v181, v189, s80, v181 +v_mad_i32_i24 v180, v189, s79, v180 +v_cmp_ge_i32_e64 vcc, v180, 0 +v_add_co_ci_u32_e64 v189, vcc, 0, 0, vcc +v_add_co_u32_e64 v181, vcc, v181, v189 +v_mad_i32_i24 v180, v189, s74, v180 +v_add_co_u32_e64 v180, vcc, s77, v180 +v_add_co_ci_u32_e64 v189, vcc, 0, 0, vcc +v_add_co_u32_e64 v181, vcc, v181, v189 +v_mad_i32_i24 v180, v189, s74, v180 +v_add_co_u32_e64 v181, vcc, s78, v181 +s_mov_b32 s83, s84 +v_cmp_le_u32_e32 vcc, 0x100, v1 +s_cbranch_vccz 255 +v_subrev_co_u32_e64 v189, vcc, s75, v179 +v_subrev_co_u32_e64 v190, vcc, s74, v180 +s_bitcmp1_b32 s18, 22 +s_cbranch_scc0 66 +s_bitset0_b32 s18, 22 +s_bfe_u32 s52, s18, 0x10014 +v_mul_u32_u24_e32 v194, 3, v189 +v_mul_u32_u24_e32 v195, 3, v190 +v_cvt_pk_u16_u32_e64 v197, v194, v195 +v_and_b32_e64 v194, v1, 1 +v_cmp_eq_u32_e64 vcc, v194, 1 +v_cndmask_b32_e32 v197, v181, v197, vcc +v_lshrrev_b32_e32 v193, 1, v1 +v_bfe_u32 v198, v193, s52, 1 +v_lshrrev_b32_e32 v193, 1, v1 +v_bfi_b32 v193, 1, v1, v193 +v_lshrrev_b32_e32 v194, 2, v1 +v_bfi_b32 v194, 1, v1, v194 +v_cmp_eq_u32_e64 vcc, s52, 0 +v_cndmask_b32_e32 v193, v194, v193, vcc +s_sub_u32 s52, 1, s52 +v_lshrrev_b32_e32 v194, s52, v193 +v_bfi_b32 v193, 32, v194, v193 +v_and_b32_e32 v193, 63, v193 +v_add_co_u32_e64 v194, vcc, 16, v193 +v_and_b32_e64 v195, v1, 2 +v_cmp_eq_u32_e64 vcc, v195, 0 +v_cndmask_b32_e32 v194, v194, v193, vcc +v_lshlrev_b32_e32 v195, 14, v198 +v_mad_u32_u24 v194, 4, v194, v195 +v_add_co_u32_e64 v193, vcc, s96, v194 +ds_write_b32 v193, v197 +v_writelane_b32 v195, s18, 0 +v_writelane_b32 v195, s85, 1 +v_writelane_b32 v195, s84, 2 +v_and_b32_e64 v193, v1, 63 +v_cmp_ge_u32_e64 vcc, v193, 3 +v_mov_b32_e32 v196, 0x4000 +v_cndmask_b32_e32 v193, v193, v196, vcc +v_mad_i32_i24 v193, v193, 4, s96 +ds_write_b32 v193, v195 offset:256 +s_add_u32 s96, s96, 0x18c +s_cmp_eq_u32 s96, 0xffc0 +s_cselect_b32 s96, 0xc1e0, s96 +v_mov_b32_dpp v191, v181 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf +v_mov_b32_dpp v189, v189 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf +v_mov_b32_dpp v190, v190 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf +v_readfirstlane_b32 s81, v191 +v_sub_co_u32_e64 v192, vcc, v191, s81 +v_mul_lo_u32 v192, v192, s65 +v_and_b32_e64 v196, v1, 3 +v_ashrrev_i32_e64 v197, 0, s31 +v_subrev_co_u32_e64 v196, vcc, v197, v196 +v_ashrrev_i32_e64 v197, 0, s62 +v_mad_i32_i24 v193, v197, 2, v196 +s_bfe_u32 s52, s18, 0x10014 +v_lshrrev_b32_e32 v195, 2, v1 +v_and_b32_e32 v195, s52, v195 +v_mad_i32_i24 v193, v195, 2, v193 +v_add_co_u32_e64 v194, vcc, 0, s63 +v_ashrrev_i32_e32 v194, 0, v194 +v_add_co_u32_e64 v195, vcc, 0, s30 +v_ashrrev_i32_e32 v195, 0, v195 +v_sub_nc_i32 v194, v194, v195 +s_lshl_b32 s54, s15, 2 +v_cmp_ge_u32_e64 s[52:53], v191, s12 +v_mad_i32_i24 v189, v189, 3, v193 +v_cmp_ge_u32_e64 s[56:57], v189, s15 +v_mad_i32_i24 v189, 4, v189, v192 +s_or_b64 s[56:57], s[56:57], s[52:53] +v_mad_i32_i24 v190, v190, 3, v194 +v_cmp_ge_u32_e64 s[58:59], v190, s14 +s_or_b64 s[58:59], s[56:57], s[58:59] +v_mad_u32_u24 v149, v190, s54, v189 +v_cndmask_b32_e64 v149, v149, -1, s[58:59] +v_add_co_u32_e64 v190, vcc, 1, v190 +v_cmp_ge_u32_e64 s[58:59], v190, s14 +s_or_b64 s[58:59], s[56:57], s[58:59] +v_mad_u32_u24 v150, v190, s54, v189 +v_cndmask_b32_e64 v150, v150, -1, s[58:59] +v_add_co_u32_e64 v190, vcc, 1, v190 +v_cmp_ge_u32_e64 s[58:59], v190, s14 +s_or_b64 s[58:59], s[56:57], s[58:59] +v_mad_u32_u24 v151, v190, s54, v189 +v_cndmask_b32_e64 v151, v151, -1, s[58:59] +v_add_co_u32_e64 v190, vcc, 1, v190 +v_cmp_ge_u32_e64 s[58:59], v190, s14 +s_or_b64 s[58:59], s[56:57], s[58:59] +v_mad_u32_u24 v152, v190, s54, v189 +v_cndmask_b32_e64 v152, v152, -1, s[58:59] +s_bitcmp1_b32 s18, 20 +s_cbranch_scc0 60 +v_mov_b32_dpp v191, v181 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xf +v_mov_b32_dpp v189, v179 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xf +v_mov_b32_dpp v190, v180 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xf +v_cmp_ge_u32_e64 s[52:53], v191, s12 +v_sub_co_u32_e64 v192, vcc, v191, s81 +v_mul_lo_u32 v192, v192, s65 +v_sub_co_u32_e64 v189, vcc, v189, s75 +v_sub_co_u32_e64 v190, vcc, v190, s74 +v_mad_i32_i24 v189, v189, 3, v193 +v_cmp_ge_u32_e64 s[56:57], v189, s15 +v_mad_i32_i24 v189, 4, v189, v192 +s_or_b64 s[56:57], s[56:57], s[52:53] +v_mad_i32_i24 v190, v190, 3, v194 +v_cmp_ge_u32_e64 s[58:59], v190, s14 +s_or_b64 s[58:59], s[56:57], s[58:59] +v_mad_u32_u24 v153, v190, s54, v189 +v_cndmask_b32_e64 v153, v153, -1, s[58:59] +v_add_co_u32_e64 v190, vcc, 1, v190 +v_cmp_ge_u32_e64 s[58:59], v190, s14 +s_or_b64 s[58:59], s[56:57], s[58:59] +v_mad_u32_u24 v154, v190, s54, v189 +v_cndmask_b32_e64 v154, v154, -1, s[58:59] +v_add_co_u32_e64 v190, vcc, 1, v190 +v_cmp_ge_u32_e64 s[58:59], v190, s14 +s_or_b64 s[58:59], s[56:57], s[58:59] +v_mad_u32_u24 v155, v190, s54, v189 +v_cndmask_b32_e64 v155, v155, -1, s[58:59] +v_add_co_u32_e64 v190, vcc, 1, v190 +v_cmp_ge_u32_e64 s[58:59], v190, s14 +s_or_b64 s[58:59], s[56:57], s[58:59] +v_mad_u32_u24 v156, v190, s54, v189 +v_cndmask_b32_e64 v156, v156, -1, s[58:59] +s_branch 26 +s_bitcmp1_b32 s18, 24 +s_cselect_b32 s52, s68, 0 +v_add_co_u32_e64 v189, vcc, v149, s52 +v_cmp_eq_u32_e64 vcc, v149, -1 +v_cndmask_b32_e64 v153, v189, -1, vcc +v_add_co_u32_e64 v189, vcc, v150, s52 +v_cmp_eq_u32_e64 vcc, v150, -1 +v_cndmask_b32_e64 v154, v189, -1, vcc +v_add_co_u32_e64 v189, vcc, v151, s52 +v_cmp_eq_u32_e64 vcc, v151, -1 +v_cndmask_b32_e64 v155, v189, -1, vcc +v_add_co_u32_e64 v189, vcc, v152, s52 +v_cmp_eq_u32_e64 vcc, v152, -1 +v_cndmask_b32_e64 v156, v189, -1, vcc +s_bitcmp1_b32 s18, 18 +s_cbranch_scc1 154 +s_lshr_b32 s52, -1, 16 +s_and_b32 s52, s52, s65 +s_lshr_b32 s53, s65, 16 +s_mul_i32 s53, s53, s81 +s_mul_i32 s40, s52, s81 +s_lshl_b32 s52, s53, 16 +s_lshr_b32 s53, s53, 16 +s_add_u32 s40, s52, s40 +s_addc_u32 s41, s53, 0 +s_add_u32 s40, s40, s20 +s_addc_u32 s41, s41, s21 +s_branch 130 +s_bitcmp1_b32 s18, 18 +s_cbranch_scc1 140 +s_bfe_u32 s52, s18, 0x10014 +v_xor_b32_dpp v189, v1, v1 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0xf +v_bfe_u32 v191, v1, 2, s52 +v_mad_u32_u24 v189, v191, 2, v189 +v_mad_u32_u24 v189, s62, 2, v189 +v_sub_co_u32_e64 v191, vcc, s29, v189 +v_sub_co_u32_e64 v191, vcc, v191, 1 +s_bfe_u32 s54, s18, 0x10001 +v_cmp_eq_u32_e64 vcc, s54, 1 +v_cndmask_b32_e32 v189, v189, v191, vcc +v_cmp_ge_u32_e64 s[52:53], v189, s29 +v_lshlrev_b32_e32 v189, 2, v189 +s_bfe_u32 s54, s18, 0x10018 +v_bfe_u32 v192, v1, 2, s54 +v_mul_lo_u32 v192, s68, v192 +v_add_co_u32_e64 v189, vcc, v189, v192 +v_mul_lo_u32 v190, s90, v182 +v_add_co_u32_e64 v190, vcc, v190, v189 +s_sub_u32 s54, s28, s63 +s_sub_u32 s54, s54, 2 +s_bitcmp1_b32 s18, 0 +s_cselect_b32 s54, s54, s63 +v_mov_b32_e32 v192, s54 +s_lshl_b32 s57, s29, 2 +v_cmp_ge_u32_e64 s[54:55], v192, s28 +v_mad_i32_i24 v149, v192, s57, v190 +s_or_b64 s[54:55], s[54:55], s[52:53] +v_cndmask_b32_e64 v149, v149, -1, s[54:55] +v_mov_b32_e32 v150, v149 +v_add_co_u32_e64 v192, vcc, v192, 1 +v_cmp_ge_u32_e64 s[54:55], v192, s28 +v_mad_i32_i24 v152, v192, s57, v190 +s_or_b64 s[54:55], s[54:55], s[52:53] +v_cndmask_b32_e64 v152, v152, -1, s[54:55] +v_add_co_u32_e64 v192, vcc, v192, 1 +v_cmp_ge_u32_e64 s[54:55], v192, s28 +v_mad_i32_i24 v151, v192, s57, v190 +s_or_b64 s[54:55], s[54:55], s[52:53] +v_cndmask_b32_e64 v151, v151, -1, s[54:55] +s_bitcmp1_b32 s18, 0 +s_cselect_b64 vcc, -1, 0 +v_cndmask_b32_e32 v149, v150, v152, vcc +v_cndmask_b32_e32 v152, v152, v150, vcc +s_lshl_b32 s52, s90, 3 +s_and_b32 s53, s18, 0x1100000 +s_cselect_b32 s52, s52, 0 +v_add_co_u32_e64 v189, vcc, v149, s52 +v_cmp_eq_u32_e64 vcc, v149, -1 +v_cndmask_b32_e64 v153, v189, -1, vcc +v_add_co_u32_e64 v189, vcc, v150, s52 +v_cmp_eq_u32_e64 vcc, v150, -1 +v_cndmask_b32_e64 v154, v189, -1, vcc +v_add_co_u32_e64 v189, vcc, v151, s52 +v_cmp_eq_u32_e64 vcc, v151, -1 +v_cndmask_b32_e64 v155, v189, -1, vcc +v_add_co_u32_e64 v189, vcc, v152, s52 +v_cmp_eq_u32_e64 vcc, v152, -1 +v_cndmask_b32_e64 v156, v189, -1, vcc +v_add_co_u32_e64 v189, vcc, v182, s83 +v_cmp_lt_u32_e64 vcc, v189, s16 +v_cndmask_b32_e32 v149, -1, v149, vcc +v_cndmask_b32_e32 v150, -1, v150, vcc +v_cndmask_b32_e32 v151, -1, v151, vcc +v_cndmask_b32_e32 v152, -1, v152, vcc +s_and_b32 s52, s18, 0x1100000 +s_cbranch_scc0 4 +v_add_co_u32_e64 v189, vcc, v189, 8 +v_cmp_lt_u32_e64 vcc, v189, s16 +v_cndmask_b32_e32 v153, -1, v153, vcc +v_cndmask_b32_e32 v154, -1, v154, vcc +v_cndmask_b32_e32 v155, -1, v155, vcc +v_cndmask_b32_e32 v156, -1, v156, vcc +s_lshr_b32 s52, -1, 16 +s_and_b32 s52, s52, s90 +s_lshr_b32 s53, s90, 16 +s_mul_i32 s53, s53, s83 +s_mul_i32 s40, s52, s83 +s_lshl_b32 s52, s53, 16 +s_lshr_b32 s53, s53, 16 +s_add_u32 s40, s52, s40 +s_addc_u32 s41, s53, 0 +s_add_u32 s40, s40, s22 +s_addc_u32 s41, s41, s23 +s_mov_b32 s43, 0x31014000 +s_mov_b32 s73, -1 +s_bfe_u32 s52, s18, 0x10014 +s_lshl_b32 s82, s13, s52 +s_bitcmp1_b32 s18, 20 +s_cselect_b32 s52, 0, 0x2000000 +s_bitcmp1_b32 s13, 0 +s_cselect_b32 s52, s52, 0 +s_xor_b32 s18, s18, s52 +s_branch 64819 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_and_b32 s52, 0x900000, s18 +s_subb_u32 s62, s62, 1 +s_cbranch_scc0 65124 +s_and_b32 s52, 0x900000, s18 +s_subb_u32 s62, s61, 1 +s_add_u32 s63, s63, 2 +s_cmp_ge_u32 s63, s28 +s_cbranch_scc0 65118 +s_mov_b32 s63, 0 +s_branch 65080 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_mov_b32 s52, 0x3c3c3c3c +s_mov_b32 s53, s52 +v_mov_b32_e32 v190, v3 +v_mov_b32_e32 v191, v4 +v_mov_b32_e32 v192, v5 +s_waitcnt lgkmcnt(0) +v_mov_b32_e32 v189, v2 +v_add_f32_dpp v189, v2, v2 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v190, v3, v3 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v191, v4, v4 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v192, v5, v5 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v4, v4, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v5, v5, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v2, v2, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v3, v3, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mov_b32_dpp v3, v4 row_mirror row_mask:0xf bank_mask:0x3 +v_mov_b32_dpp v2, v5 row_mirror row_mask:0xf bank_mask:0x3 +v_add_f32_dpp v190, v191, v190 row_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v189, v192, v189 row_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v192, v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v3, v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_sub_f32_dpp v191, v2, v2 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v2, v2, v2 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_e32 v5, v190 +v_add_f32_dpp v5, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v192 row_ror:12 row_mask:0xf bank_mask:0x1 +v_mov_b32_dpp v191, v192 row_ror:4 row_mask:0xf bank_mask:0x8 +v_mov_b32_e32 v4, v189 +v_add_f32_dpp v4, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v191 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0x3 +v_sub_f32_dpp v192, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v3, v3, v2 row_half_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v2, v5, v4 row_half_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v4, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v4, v192 row_half_mirror row_mask:0xf bank_mask:0x9 +v_mov_b32_dpp v191, v191 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xc +v_cndmask_b32_e64 v3, v191, v3, s[52:53] +v_mov_b32_dpp v4, v4 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0x5 +s_nop 1 +v_mov_b32_dpp v4, v4 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +v_mov_b32_e32 v190, v7 +v_mov_b32_e32 v191, v8 +v_mov_b32_e32 v192, v9 +s_waitcnt lgkmcnt(0) +v_mov_b32_e32 v189, v6 +v_add_f32_dpp v189, v6, v6 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v190, v7, v7 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v191, v8, v8 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v192, v9, v9 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v8, v8, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v9, v9, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v6, v6, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v7, v7, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mov_b32_dpp v7, v8 row_mirror row_mask:0xf bank_mask:0x3 +v_mov_b32_dpp v6, v9 row_mirror row_mask:0xf bank_mask:0x3 +v_add_f32_dpp v190, v191, v190 row_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v189, v192, v189 row_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v192, v7, v7 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v7, v7, v7 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_sub_f32_dpp v191, v6, v6 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v6, v6, v6 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_e32 v9, v190 +v_add_f32_dpp v9, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v192 row_ror:12 row_mask:0xf bank_mask:0x1 +v_mov_b32_dpp v191, v192 row_ror:4 row_mask:0xf bank_mask:0x8 +v_mov_b32_e32 v8, v189 +v_add_f32_dpp v8, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v191 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0x3 +v_sub_f32_dpp v192, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v7, v7, v6 row_half_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v5, v9, v8 row_half_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v8, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v8, v192 row_half_mirror row_mask:0xf bank_mask:0x9 +v_mov_b32_dpp v191, v191 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xc +v_cndmask_b32_e64 v6, v191, v7, s[52:53] +v_mov_b32_dpp v7, v8 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0x5 +s_nop 1 +v_mov_b32_dpp v7, v8 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +v_mov_b32_e32 v190, v11 +v_mov_b32_e32 v191, v12 +v_mov_b32_e32 v192, v13 +s_waitcnt lgkmcnt(0) +v_mov_b32_e32 v189, v10 +v_add_f32_dpp v189, v10, v10 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v190, v11, v11 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v191, v12, v12 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v192, v13, v13 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v12, v12, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v13, v13, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v10, v10, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v11, v11, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mov_b32_dpp v11, v12 row_mirror row_mask:0xf bank_mask:0x3 +v_mov_b32_dpp v10, v13 row_mirror row_mask:0xf bank_mask:0x3 +v_add_f32_dpp v190, v191, v190 row_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v189, v192, v189 row_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v192, v11, v11 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v11, v11, v11 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_sub_f32_dpp v191, v10, v10 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v10, v10, v10 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_e32 v13, v190 +v_add_f32_dpp v13, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v192 row_ror:12 row_mask:0xf bank_mask:0x1 +v_mov_b32_dpp v191, v192 row_ror:4 row_mask:0xf bank_mask:0x8 +v_mov_b32_e32 v12, v189 +v_add_f32_dpp v12, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v191 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0x3 +v_sub_f32_dpp v192, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v11, v11, v10 row_half_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v8, v13, v12 row_half_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v12, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v12, v192 row_half_mirror row_mask:0xf bank_mask:0x9 +v_mov_b32_dpp v191, v191 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xc +v_cndmask_b32_e64 v9, v191, v11, s[52:53] +v_mov_b32_dpp v10, v12 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0x5 +s_nop 1 +v_mov_b32_dpp v10, v12 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +v_mov_b32_e32 v190, v15 +v_mov_b32_e32 v191, v16 +v_mov_b32_e32 v192, v17 +s_waitcnt lgkmcnt(0) +v_mov_b32_e32 v189, v14 +v_add_f32_dpp v189, v14, v14 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v190, v15, v15 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v191, v16, v16 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v192, v17, v17 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v16, v16, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v17, v17, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v14, v14, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v15, v15, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mov_b32_dpp v15, v16 row_mirror row_mask:0xf bank_mask:0x3 +v_mov_b32_dpp v14, v17 row_mirror row_mask:0xf bank_mask:0x3 +v_add_f32_dpp v190, v191, v190 row_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v189, v192, v189 row_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v192, v15, v15 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v15, v15, v15 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_sub_f32_dpp v191, v14, v14 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v14, v14, v14 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_e32 v17, v190 +v_add_f32_dpp v17, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v192 row_ror:12 row_mask:0xf bank_mask:0x1 +v_mov_b32_dpp v191, v192 row_ror:4 row_mask:0xf bank_mask:0x8 +v_mov_b32_e32 v16, v189 +v_add_f32_dpp v16, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v191 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0x3 +v_sub_f32_dpp v192, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v15, v15, v14 row_half_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v11, v17, v16 row_half_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v16, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v16, v192 row_half_mirror row_mask:0xf bank_mask:0x9 +v_mov_b32_dpp v191, v191 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xc +v_cndmask_b32_e64 v12, v191, v15, s[52:53] +v_mov_b32_dpp v13, v16 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0x5 +s_nop 1 +v_mov_b32_dpp v13, v16 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +v_mov_b32_e32 v190, v19 +v_mov_b32_e32 v191, v20 +v_mov_b32_e32 v192, v21 +s_waitcnt lgkmcnt(0) +v_mov_b32_e32 v189, v18 +v_add_f32_dpp v189, v18, v18 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v190, v19, v19 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v191, v20, v20 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v192, v21, v21 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v20, v20, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v21, v21, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v18, v18, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v19, v19, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mov_b32_dpp v19, v20 row_mirror row_mask:0xf bank_mask:0x3 +v_mov_b32_dpp v18, v21 row_mirror row_mask:0xf bank_mask:0x3 +v_add_f32_dpp v190, v191, v190 row_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v189, v192, v189 row_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v192, v19, v19 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v19, v19, v19 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_sub_f32_dpp v191, v18, v18 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v18, v18, v18 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_e32 v21, v190 +v_add_f32_dpp v21, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v192 row_ror:12 row_mask:0xf bank_mask:0x1 +v_mov_b32_dpp v191, v192 row_ror:4 row_mask:0xf bank_mask:0x8 +v_mov_b32_e32 v20, v189 +v_add_f32_dpp v20, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v191 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0x3 +v_sub_f32_dpp v192, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v19, v19, v18 row_half_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v14, v21, v20 row_half_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v20, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v20, v192 row_half_mirror row_mask:0xf bank_mask:0x9 +v_mov_b32_dpp v191, v191 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xc +v_cndmask_b32_e64 v15, v191, v19, s[52:53] +v_mov_b32_dpp v16, v20 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0x5 +s_nop 1 +v_mov_b32_dpp v16, v20 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +v_mov_b32_e32 v190, v23 +v_mov_b32_e32 v191, v24 +v_mov_b32_e32 v192, v25 +s_waitcnt lgkmcnt(0) +v_mov_b32_e32 v189, v22 +v_add_f32_dpp v189, v22, v22 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v190, v23, v23 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v191, v24, v24 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v192, v25, v25 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v24, v24, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v25, v25, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v22, v22, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v23, v23, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mov_b32_dpp v23, v24 row_mirror row_mask:0xf bank_mask:0x3 +v_mov_b32_dpp v22, v25 row_mirror row_mask:0xf bank_mask:0x3 +v_add_f32_dpp v190, v191, v190 row_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v189, v192, v189 row_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v192, v23, v23 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v23, v23, v23 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_sub_f32_dpp v191, v22, v22 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v22, v22, v22 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_e32 v25, v190 +v_add_f32_dpp v25, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v192 row_ror:12 row_mask:0xf bank_mask:0x1 +v_mov_b32_dpp v191, v192 row_ror:4 row_mask:0xf bank_mask:0x8 +v_mov_b32_e32 v24, v189 +v_add_f32_dpp v24, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v191 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0x3 +v_sub_f32_dpp v192, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v23, v23, v22 row_half_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v17, v25, v24 row_half_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v24, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v24, v192 row_half_mirror row_mask:0xf bank_mask:0x9 +v_mov_b32_dpp v191, v191 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xc +v_cndmask_b32_e64 v18, v191, v23, s[52:53] +v_mov_b32_dpp v19, v24 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0x5 +s_nop 1 +v_mov_b32_dpp v19, v24 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +v_mov_b32_e32 v190, v27 +v_mov_b32_e32 v191, v28 +v_mov_b32_e32 v192, v29 +s_waitcnt lgkmcnt(0) +v_mov_b32_e32 v189, v26 +v_add_f32_dpp v189, v26, v26 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v190, v27, v27 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v191, v28, v28 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v192, v29, v29 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v28, v28, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v29, v29, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v26, v26, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v27, v27, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mov_b32_dpp v27, v28 row_mirror row_mask:0xf bank_mask:0x3 +v_mov_b32_dpp v26, v29 row_mirror row_mask:0xf bank_mask:0x3 +v_add_f32_dpp v190, v191, v190 row_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v189, v192, v189 row_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v192, v27, v27 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v27, v27, v27 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_sub_f32_dpp v191, v26, v26 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v26, v26, v26 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_e32 v29, v190 +v_add_f32_dpp v29, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v192 row_ror:12 row_mask:0xf bank_mask:0x1 +v_mov_b32_dpp v191, v192 row_ror:4 row_mask:0xf bank_mask:0x8 +v_mov_b32_e32 v28, v189 +v_add_f32_dpp v28, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v191 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0x3 +v_sub_f32_dpp v192, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v27, v27, v26 row_half_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v20, v29, v28 row_half_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v28, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v28, v192 row_half_mirror row_mask:0xf bank_mask:0x9 +v_mov_b32_dpp v191, v191 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xc +v_cndmask_b32_e64 v21, v191, v27, s[52:53] +v_mov_b32_dpp v22, v28 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0x5 +s_nop 1 +v_mov_b32_dpp v22, v28 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +v_mov_b32_e32 v190, v31 +v_mov_b32_e32 v191, v32 +v_mov_b32_e32 v192, v33 +s_waitcnt lgkmcnt(0) +v_mov_b32_e32 v189, v30 +v_add_f32_dpp v189, v30, v30 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v190, v31, v31 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v191, v32, v32 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v192, v33, v33 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v32, v32, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v33, v33, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v30, v30, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v31, v31, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mov_b32_dpp v31, v32 row_mirror row_mask:0xf bank_mask:0x3 +v_mov_b32_dpp v30, v33 row_mirror row_mask:0xf bank_mask:0x3 +v_add_f32_dpp v190, v191, v190 row_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v189, v192, v189 row_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v192, v31, v31 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v31, v31, v31 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_sub_f32_dpp v191, v30, v30 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v30, v30, v30 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_e32 v33, v190 +v_add_f32_dpp v33, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v192 row_ror:12 row_mask:0xf bank_mask:0x1 +v_mov_b32_dpp v191, v192 row_ror:4 row_mask:0xf bank_mask:0x8 +v_mov_b32_e32 v32, v189 +v_add_f32_dpp v32, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v191 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0x3 +v_sub_f32_dpp v192, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v31, v31, v30 row_half_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v23, v33, v32 row_half_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v32, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v32, v192 row_half_mirror row_mask:0xf bank_mask:0x9 +v_mov_b32_dpp v191, v191 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xc +v_cndmask_b32_e64 v24, v191, v31, s[52:53] +v_mov_b32_dpp v25, v32 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0x5 +s_nop 1 +v_mov_b32_dpp v25, v32 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +v_mov_b32_e32 v190, v35 +v_mov_b32_e32 v191, v36 +v_mov_b32_e32 v192, v37 +s_waitcnt lgkmcnt(0) +v_mov_b32_e32 v189, v34 +v_add_f32_dpp v189, v34, v34 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v190, v35, v35 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v191, v36, v36 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v192, v37, v37 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v36, v36, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v37, v37, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v34, v34, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v35, v35, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mov_b32_dpp v35, v36 row_mirror row_mask:0xf bank_mask:0x3 +v_mov_b32_dpp v34, v37 row_mirror row_mask:0xf bank_mask:0x3 +v_add_f32_dpp v190, v191, v190 row_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v189, v192, v189 row_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v192, v35, v35 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v35, v35, v35 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_sub_f32_dpp v191, v34, v34 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v34, v34, v34 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_e32 v37, v190 +v_add_f32_dpp v37, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v192 row_ror:12 row_mask:0xf bank_mask:0x1 +v_mov_b32_dpp v191, v192 row_ror:4 row_mask:0xf bank_mask:0x8 +v_mov_b32_e32 v36, v189 +v_add_f32_dpp v36, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v191 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0x3 +v_sub_f32_dpp v192, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v35, v35, v34 row_half_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v26, v37, v36 row_half_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v36, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v36, v192 row_half_mirror row_mask:0xf bank_mask:0x9 +v_mov_b32_dpp v191, v191 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xc +v_cndmask_b32_e64 v27, v191, v35, s[52:53] +v_mov_b32_dpp v28, v36 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0x5 +s_nop 1 +v_mov_b32_dpp v28, v36 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +v_mov_b32_e32 v190, v39 +v_mov_b32_e32 v191, v40 +v_mov_b32_e32 v192, v41 +s_waitcnt lgkmcnt(0) +v_mov_b32_e32 v189, v38 +v_add_f32_dpp v189, v38, v38 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v190, v39, v39 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v191, v40, v40 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v192, v41, v41 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v40, v40, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v41, v41, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v38, v38, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v39, v39, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mov_b32_dpp v39, v40 row_mirror row_mask:0xf bank_mask:0x3 +v_mov_b32_dpp v38, v41 row_mirror row_mask:0xf bank_mask:0x3 +v_add_f32_dpp v190, v191, v190 row_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v189, v192, v189 row_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v192, v39, v39 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v39, v39, v39 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_sub_f32_dpp v191, v38, v38 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v38, v38, v38 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_e32 v41, v190 +v_add_f32_dpp v41, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v192 row_ror:12 row_mask:0xf bank_mask:0x1 +v_mov_b32_dpp v191, v192 row_ror:4 row_mask:0xf bank_mask:0x8 +v_mov_b32_e32 v40, v189 +v_add_f32_dpp v40, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v191 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0x3 +v_sub_f32_dpp v192, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v39, v39, v38 row_half_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v29, v41, v40 row_half_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v40, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v40, v192 row_half_mirror row_mask:0xf bank_mask:0x9 +v_mov_b32_dpp v191, v191 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xc +v_cndmask_b32_e64 v30, v191, v39, s[52:53] +v_mov_b32_dpp v31, v40 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0x5 +s_nop 1 +v_mov_b32_dpp v31, v40 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +v_mov_b32_e32 v190, v43 +v_mov_b32_e32 v191, v44 +v_mov_b32_e32 v192, v45 +s_waitcnt lgkmcnt(0) +v_mov_b32_e32 v189, v42 +v_add_f32_dpp v189, v42, v42 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v190, v43, v43 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v191, v44, v44 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v192, v45, v45 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v44, v44, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v45, v45, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v42, v42, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v43, v43, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mov_b32_dpp v43, v44 row_mirror row_mask:0xf bank_mask:0x3 +v_mov_b32_dpp v42, v45 row_mirror row_mask:0xf bank_mask:0x3 +v_add_f32_dpp v190, v191, v190 row_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v189, v192, v189 row_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v192, v43, v43 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v43, v43, v43 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_sub_f32_dpp v191, v42, v42 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v42, v42, v42 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_e32 v45, v190 +v_add_f32_dpp v45, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v192 row_ror:12 row_mask:0xf bank_mask:0x1 +v_mov_b32_dpp v191, v192 row_ror:4 row_mask:0xf bank_mask:0x8 +v_mov_b32_e32 v44, v189 +v_add_f32_dpp v44, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v191 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0x3 +v_sub_f32_dpp v192, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v43, v43, v42 row_half_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v32, v45, v44 row_half_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v44, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v44, v192 row_half_mirror row_mask:0xf bank_mask:0x9 +v_mov_b32_dpp v191, v191 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xc +v_cndmask_b32_e64 v33, v191, v43, s[52:53] +v_mov_b32_dpp v34, v44 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0x5 +s_nop 1 +v_mov_b32_dpp v34, v44 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +v_mov_b32_e32 v190, v47 +v_mov_b32_e32 v191, v48 +v_mov_b32_e32 v192, v49 +s_waitcnt lgkmcnt(0) +v_mov_b32_e32 v189, v46 +v_add_f32_dpp v189, v46, v46 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v190, v47, v47 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v191, v48, v48 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v192, v49, v49 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v48, v48, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v49, v49, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v46, v46, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v47, v47, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mov_b32_dpp v47, v48 row_mirror row_mask:0xf bank_mask:0x3 +v_mov_b32_dpp v46, v49 row_mirror row_mask:0xf bank_mask:0x3 +v_add_f32_dpp v190, v191, v190 row_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v189, v192, v189 row_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v192, v47, v47 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v47, v47, v47 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_sub_f32_dpp v191, v46, v46 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v46, v46, v46 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_e32 v49, v190 +v_add_f32_dpp v49, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v192 row_ror:12 row_mask:0xf bank_mask:0x1 +v_mov_b32_dpp v191, v192 row_ror:4 row_mask:0xf bank_mask:0x8 +v_mov_b32_e32 v48, v189 +v_add_f32_dpp v48, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v191 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0x3 +v_sub_f32_dpp v192, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v47, v47, v46 row_half_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v35, v49, v48 row_half_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v48, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v48, v192 row_half_mirror row_mask:0xf bank_mask:0x9 +v_mov_b32_dpp v191, v191 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xc +v_cndmask_b32_e64 v36, v191, v47, s[52:53] +v_mov_b32_dpp v37, v48 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0x5 +s_nop 1 +v_mov_b32_dpp v37, v48 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +v_mov_b32_e32 v190, v51 +v_mov_b32_e32 v191, v52 +v_mov_b32_e32 v192, v53 +s_waitcnt lgkmcnt(0) +v_mov_b32_e32 v189, v50 +v_add_f32_dpp v189, v50, v50 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v190, v51, v51 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v191, v52, v52 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v192, v53, v53 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v52, v52, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v53, v53, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v50, v50, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v51, v51, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mov_b32_dpp v51, v52 row_mirror row_mask:0xf bank_mask:0x3 +v_mov_b32_dpp v50, v53 row_mirror row_mask:0xf bank_mask:0x3 +v_add_f32_dpp v190, v191, v190 row_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v189, v192, v189 row_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v192, v51, v51 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v51, v51, v51 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_sub_f32_dpp v191, v50, v50 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v50, v50, v50 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_e32 v53, v190 +v_add_f32_dpp v53, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v192 row_ror:12 row_mask:0xf bank_mask:0x1 +v_mov_b32_dpp v191, v192 row_ror:4 row_mask:0xf bank_mask:0x8 +v_mov_b32_e32 v52, v189 +v_add_f32_dpp v52, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v191 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0x3 +v_sub_f32_dpp v192, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v51, v51, v50 row_half_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v38, v53, v52 row_half_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v52, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v52, v192 row_half_mirror row_mask:0xf bank_mask:0x9 +v_mov_b32_dpp v191, v191 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xc +v_cndmask_b32_e64 v39, v191, v51, s[52:53] +v_mov_b32_dpp v40, v52 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0x5 +s_nop 1 +v_mov_b32_dpp v40, v52 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +v_mov_b32_e32 v190, v55 +v_mov_b32_e32 v191, v56 +v_mov_b32_e32 v192, v57 +s_waitcnt lgkmcnt(0) +v_mov_b32_e32 v189, v54 +v_add_f32_dpp v189, v54, v54 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v190, v55, v55 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v191, v56, v56 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v192, v57, v57 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v56, v56, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v57, v57, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v54, v54, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v55, v55, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mov_b32_dpp v55, v56 row_mirror row_mask:0xf bank_mask:0x3 +v_mov_b32_dpp v54, v57 row_mirror row_mask:0xf bank_mask:0x3 +v_add_f32_dpp v190, v191, v190 row_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v189, v192, v189 row_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v192, v55, v55 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v55, v55, v55 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_sub_f32_dpp v191, v54, v54 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v54, v54, v54 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_e32 v57, v190 +v_add_f32_dpp v57, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v192 row_ror:12 row_mask:0xf bank_mask:0x1 +v_mov_b32_dpp v191, v192 row_ror:4 row_mask:0xf bank_mask:0x8 +v_mov_b32_e32 v56, v189 +v_add_f32_dpp v56, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v191 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0x3 +v_sub_f32_dpp v192, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v55, v55, v54 row_half_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v41, v57, v56 row_half_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v56, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v56, v192 row_half_mirror row_mask:0xf bank_mask:0x9 +v_mov_b32_dpp v191, v191 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xc +v_cndmask_b32_e64 v42, v191, v55, s[52:53] +v_mov_b32_dpp v43, v56 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0x5 +s_nop 1 +v_mov_b32_dpp v43, v56 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +v_mov_b32_e32 v190, v59 +v_mov_b32_e32 v191, v60 +v_mov_b32_e32 v192, v61 +s_waitcnt lgkmcnt(0) +v_mov_b32_e32 v189, v58 +v_add_f32_dpp v189, v58, v58 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v190, v59, v59 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v191, v60, v60 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v192, v61, v61 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v60, v60, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v61, v61, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v58, v58, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v59, v59, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mov_b32_dpp v59, v60 row_mirror row_mask:0xf bank_mask:0x3 +v_mov_b32_dpp v58, v61 row_mirror row_mask:0xf bank_mask:0x3 +v_add_f32_dpp v190, v191, v190 row_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v189, v192, v189 row_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v192, v59, v59 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v59, v59, v59 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_sub_f32_dpp v191, v58, v58 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v58, v58, v58 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_e32 v61, v190 +v_add_f32_dpp v61, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v192 row_ror:12 row_mask:0xf bank_mask:0x1 +v_mov_b32_dpp v191, v192 row_ror:4 row_mask:0xf bank_mask:0x8 +v_mov_b32_e32 v60, v189 +v_add_f32_dpp v60, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v191 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0x3 +v_sub_f32_dpp v192, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v59, v59, v58 row_half_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v44, v61, v60 row_half_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v60, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v60, v192 row_half_mirror row_mask:0xf bank_mask:0x9 +v_mov_b32_dpp v191, v191 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xc +v_cndmask_b32_e64 v45, v191, v59, s[52:53] +v_mov_b32_dpp v46, v60 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0x5 +s_nop 1 +v_mov_b32_dpp v46, v60 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +v_mov_b32_e32 v190, v63 +v_mov_b32_e32 v191, v64 +v_mov_b32_e32 v192, v65 +s_waitcnt lgkmcnt(0) +v_mov_b32_e32 v189, v62 +v_add_f32_dpp v189, v62, v62 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v190, v63, v63 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v191, v64, v64 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v192, v65, v65 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v64, v64, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v65, v65, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v62, v62, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_fmac_f32_dpp v63, v63, v184 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mov_b32_dpp v63, v64 row_mirror row_mask:0xf bank_mask:0x3 +v_mov_b32_dpp v62, v65 row_mirror row_mask:0xf bank_mask:0x3 +v_add_f32_dpp v190, v191, v190 row_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v189, v192, v189 row_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v192, v63, v63 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v63, v63, v63 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_sub_f32_dpp v191, v62, v62 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v62, v62, v62 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_e32 v65, v190 +v_add_f32_dpp v65, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v192 row_ror:12 row_mask:0xf bank_mask:0x1 +v_mov_b32_dpp v191, v192 row_ror:4 row_mask:0xf bank_mask:0x8 +v_mov_b32_e32 v64, v189 +v_add_f32_dpp v64, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v191, v191 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0x3 +v_sub_f32_dpp v192, v190, v190 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v63, v63, v62 row_half_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v47, v65, v64 row_half_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v64, v189, v189 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v64, v192 row_half_mirror row_mask:0xf bank_mask:0x9 +v_mov_b32_dpp v191, v191 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xc +v_cndmask_b32_e64 v48, v191, v63, s[52:53] +v_mov_b32_dpp v49, v64 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0x5 +s_nop 1 +v_mov_b32_dpp v49, v64 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +s_waitcnt vmcnt(0) +v_readlane_b32 s55, v187, 0 +v_add_f32_e64 v2, v2, s55 +v_mul_f32_e64 v189, v2, s36 +v_cmp_lt_f32_e64 vcc, v2, 0 +v_cndmask_b32_e32 v2, v2, v189, vcc +v_add_f32_e64 v5, v5, s55 +v_mul_f32_e64 v189, v5, s36 +v_cmp_lt_f32_e64 vcc, v5, 0 +v_cndmask_b32_e32 v5, v5, v189, vcc +buffer_store_dword v2, v157, s[44:47], 0 offen +buffer_store_dword v5, v161, s[44:47], 0 offen +v_add_f32_e64 v3, v3, s55 +v_mul_f32_e64 v189, v3, s36 +v_cmp_lt_f32_e64 vcc, v3, 0 +v_cndmask_b32_e32 v3, v3, v189, vcc +v_add_f32_e64 v6, v6, s55 +v_mul_f32_e64 v189, v6, s36 +v_cmp_lt_f32_e64 vcc, v6, 0 +v_cndmask_b32_e32 v6, v6, v189, vcc +buffer_store_dword v3, v158, s[44:47], 0 offen +buffer_store_dword v6, v162, s[44:47], 0 offen +v_add_f32_e64 v4, v4, s55 +v_mul_f32_e64 v189, v4, s36 +v_cmp_lt_f32_e64 vcc, v4, 0 +v_cndmask_b32_e32 v4, v4, v189, vcc +v_add_f32_e64 v7, v7, s55 +v_mul_f32_e64 v189, v7, s36 +v_cmp_lt_f32_e64 vcc, v7, 0 +v_cndmask_b32_e32 v7, v7, v189, vcc +buffer_store_dword v4, v159, s[44:47], 0 offen +buffer_store_dword v7, v163, s[44:47], 0 offen +s_add_u32 s44, s44, s67 +s_addc_u32 s45, s45, 0 +s_sub_u32 s93, s93, 1 +s_cselect_b32 s47, 0, s47 +v_readlane_b32 s55, v187, 1 +v_add_f32_e64 v8, v8, s55 +v_mul_f32_e64 v189, v8, s36 +v_cmp_lt_f32_e64 vcc, v8, 0 +v_cndmask_b32_e32 v8, v8, v189, vcc +v_add_f32_e64 v11, v11, s55 +v_mul_f32_e64 v189, v11, s36 +v_cmp_lt_f32_e64 vcc, v11, 0 +v_cndmask_b32_e32 v11, v11, v189, vcc +buffer_store_dword v8, v157, s[44:47], 0 offen +buffer_store_dword v11, v161, s[44:47], 0 offen +v_add_f32_e64 v9, v9, s55 +v_mul_f32_e64 v189, v9, s36 +v_cmp_lt_f32_e64 vcc, v9, 0 +v_cndmask_b32_e32 v9, v9, v189, vcc +v_add_f32_e64 v12, v12, s55 +v_mul_f32_e64 v189, v12, s36 +v_cmp_lt_f32_e64 vcc, v12, 0 +v_cndmask_b32_e32 v12, v12, v189, vcc +buffer_store_dword v9, v158, s[44:47], 0 offen +buffer_store_dword v12, v162, s[44:47], 0 offen +v_add_f32_e64 v10, v10, s55 +v_mul_f32_e64 v189, v10, s36 +v_cmp_lt_f32_e64 vcc, v10, 0 +v_cndmask_b32_e32 v10, v10, v189, vcc +v_add_f32_e64 v13, v13, s55 +v_mul_f32_e64 v189, v13, s36 +v_cmp_lt_f32_e64 vcc, v13, 0 +v_cndmask_b32_e32 v13, v13, v189, vcc +buffer_store_dword v10, v159, s[44:47], 0 offen +buffer_store_dword v13, v163, s[44:47], 0 offen +s_add_u32 s44, s44, s67 +s_addc_u32 s45, s45, 0 +s_sub_u32 s93, s93, 1 +s_cselect_b32 s47, 0, s47 +v_readlane_b32 s55, v187, 2 +v_add_f32_e64 v14, v14, s55 +v_mul_f32_e64 v189, v14, s36 +v_cmp_lt_f32_e64 vcc, v14, 0 +v_cndmask_b32_e32 v14, v14, v189, vcc +v_add_f32_e64 v17, v17, s55 +v_mul_f32_e64 v189, v17, s36 +v_cmp_lt_f32_e64 vcc, v17, 0 +v_cndmask_b32_e32 v17, v17, v189, vcc +buffer_store_dword v14, v157, s[44:47], 0 offen +buffer_store_dword v17, v161, s[44:47], 0 offen +v_add_f32_e64 v15, v15, s55 +v_mul_f32_e64 v189, v15, s36 +v_cmp_lt_f32_e64 vcc, v15, 0 +v_cndmask_b32_e32 v15, v15, v189, vcc +v_add_f32_e64 v18, v18, s55 +v_mul_f32_e64 v189, v18, s36 +v_cmp_lt_f32_e64 vcc, v18, 0 +v_cndmask_b32_e32 v18, v18, v189, vcc +buffer_store_dword v15, v158, s[44:47], 0 offen +buffer_store_dword v18, v162, s[44:47], 0 offen +v_add_f32_e64 v16, v16, s55 +v_mul_f32_e64 v189, v16, s36 +v_cmp_lt_f32_e64 vcc, v16, 0 +v_cndmask_b32_e32 v16, v16, v189, vcc +v_add_f32_e64 v19, v19, s55 +v_mul_f32_e64 v189, v19, s36 +v_cmp_lt_f32_e64 vcc, v19, 0 +v_cndmask_b32_e32 v19, v19, v189, vcc +buffer_store_dword v16, v159, s[44:47], 0 offen +buffer_store_dword v19, v163, s[44:47], 0 offen +s_add_u32 s44, s44, s67 +s_addc_u32 s45, s45, 0 +s_sub_u32 s93, s93, 1 +s_cselect_b32 s47, 0, s47 +v_readlane_b32 s55, v187, 3 +v_add_f32_e64 v20, v20, s55 +v_mul_f32_e64 v189, v20, s36 +v_cmp_lt_f32_e64 vcc, v20, 0 +v_cndmask_b32_e32 v20, v20, v189, vcc +v_add_f32_e64 v23, v23, s55 +v_mul_f32_e64 v189, v23, s36 +v_cmp_lt_f32_e64 vcc, v23, 0 +v_cndmask_b32_e32 v23, v23, v189, vcc +buffer_store_dword v20, v157, s[44:47], 0 offen +buffer_store_dword v23, v161, s[44:47], 0 offen +v_add_f32_e64 v21, v21, s55 +v_mul_f32_e64 v189, v21, s36 +v_cmp_lt_f32_e64 vcc, v21, 0 +v_cndmask_b32_e32 v21, v21, v189, vcc +v_add_f32_e64 v24, v24, s55 +v_mul_f32_e64 v189, v24, s36 +v_cmp_lt_f32_e64 vcc, v24, 0 +v_cndmask_b32_e32 v24, v24, v189, vcc +buffer_store_dword v21, v158, s[44:47], 0 offen +buffer_store_dword v24, v162, s[44:47], 0 offen +v_add_f32_e64 v22, v22, s55 +v_mul_f32_e64 v189, v22, s36 +v_cmp_lt_f32_e64 vcc, v22, 0 +v_cndmask_b32_e32 v22, v22, v189, vcc +v_add_f32_e64 v25, v25, s55 +v_mul_f32_e64 v189, v25, s36 +v_cmp_lt_f32_e64 vcc, v25, 0 +v_cndmask_b32_e32 v25, v25, v189, vcc +buffer_store_dword v22, v159, s[44:47], 0 offen +buffer_store_dword v25, v163, s[44:47], 0 offen +s_add_u32 s44, s44, s67 +s_addc_u32 s45, s45, 0 +s_sub_u32 s93, s93, 1 +s_cselect_b32 s47, 0, s47 +s_lshl_b32 s52, s67, 2 +s_add_u32 s44, s44, s52 +s_addc_u32 s45, s45, 0 +s_sub_u32 s93, s93, 4 +s_cselect_b32 s47, 0, s47 +v_readlane_b32 s55, v187, 8 +v_add_f32_e64 v26, v26, s55 +v_mul_f32_e64 v189, v26, s36 +v_cmp_lt_f32_e64 vcc, v26, 0 +v_cndmask_b32_e32 v26, v26, v189, vcc +v_add_f32_e64 v29, v29, s55 +v_mul_f32_e64 v189, v29, s36 +v_cmp_lt_f32_e64 vcc, v29, 0 +v_cndmask_b32_e32 v29, v29, v189, vcc +buffer_store_dword v26, v157, s[44:47], 0 offen +buffer_store_dword v29, v161, s[44:47], 0 offen +v_add_f32_e64 v27, v27, s55 +v_mul_f32_e64 v189, v27, s36 +v_cmp_lt_f32_e64 vcc, v27, 0 +v_cndmask_b32_e32 v27, v27, v189, vcc +v_add_f32_e64 v30, v30, s55 +v_mul_f32_e64 v189, v30, s36 +v_cmp_lt_f32_e64 vcc, v30, 0 +v_cndmask_b32_e32 v30, v30, v189, vcc +buffer_store_dword v27, v158, s[44:47], 0 offen +buffer_store_dword v30, v162, s[44:47], 0 offen +v_add_f32_e64 v28, v28, s55 +v_mul_f32_e64 v189, v28, s36 +v_cmp_lt_f32_e64 vcc, v28, 0 +v_cndmask_b32_e32 v28, v28, v189, vcc +v_add_f32_e64 v31, v31, s55 +v_mul_f32_e64 v189, v31, s36 +v_cmp_lt_f32_e64 vcc, v31, 0 +v_cndmask_b32_e32 v31, v31, v189, vcc +buffer_store_dword v28, v159, s[44:47], 0 offen +buffer_store_dword v31, v163, s[44:47], 0 offen +s_add_u32 s44, s44, s67 +s_addc_u32 s45, s45, 0 +s_sub_u32 s93, s93, 1 +s_cselect_b32 s47, 0, s47 +v_readlane_b32 s55, v187, 9 +v_add_f32_e64 v32, v32, s55 +v_mul_f32_e64 v189, v32, s36 +v_cmp_lt_f32_e64 vcc, v32, 0 +v_cndmask_b32_e32 v32, v32, v189, vcc +v_add_f32_e64 v35, v35, s55 +v_mul_f32_e64 v189, v35, s36 +v_cmp_lt_f32_e64 vcc, v35, 0 +v_cndmask_b32_e32 v35, v35, v189, vcc +buffer_store_dword v32, v157, s[44:47], 0 offen +buffer_store_dword v35, v161, s[44:47], 0 offen +v_add_f32_e64 v33, v33, s55 +v_mul_f32_e64 v189, v33, s36 +v_cmp_lt_f32_e64 vcc, v33, 0 +v_cndmask_b32_e32 v33, v33, v189, vcc +v_add_f32_e64 v36, v36, s55 +v_mul_f32_e64 v189, v36, s36 +v_cmp_lt_f32_e64 vcc, v36, 0 +v_cndmask_b32_e32 v36, v36, v189, vcc +buffer_store_dword v33, v158, s[44:47], 0 offen +buffer_store_dword v36, v162, s[44:47], 0 offen +v_add_f32_e64 v34, v34, s55 +v_mul_f32_e64 v189, v34, s36 +v_cmp_lt_f32_e64 vcc, v34, 0 +v_cndmask_b32_e32 v34, v34, v189, vcc +v_add_f32_e64 v37, v37, s55 +v_mul_f32_e64 v189, v37, s36 +v_cmp_lt_f32_e64 vcc, v37, 0 +v_cndmask_b32_e32 v37, v37, v189, vcc +buffer_store_dword v34, v159, s[44:47], 0 offen +buffer_store_dword v37, v163, s[44:47], 0 offen +s_add_u32 s44, s44, s67 +s_addc_u32 s45, s45, 0 +s_sub_u32 s93, s93, 1 +s_cselect_b32 s47, 0, s47 +v_readlane_b32 s55, v187, 10 +v_add_f32_e64 v38, v38, s55 +v_mul_f32_e64 v189, v38, s36 +v_cmp_lt_f32_e64 vcc, v38, 0 +v_cndmask_b32_e32 v38, v38, v189, vcc +v_add_f32_e64 v41, v41, s55 +v_mul_f32_e64 v189, v41, s36 +v_cmp_lt_f32_e64 vcc, v41, 0 +v_cndmask_b32_e32 v41, v41, v189, vcc +buffer_store_dword v38, v157, s[44:47], 0 offen +buffer_store_dword v41, v161, s[44:47], 0 offen +v_add_f32_e64 v39, v39, s55 +v_mul_f32_e64 v189, v39, s36 +v_cmp_lt_f32_e64 vcc, v39, 0 +v_cndmask_b32_e32 v39, v39, v189, vcc +v_add_f32_e64 v42, v42, s55 +v_mul_f32_e64 v189, v42, s36 +v_cmp_lt_f32_e64 vcc, v42, 0 +v_cndmask_b32_e32 v42, v42, v189, vcc +buffer_store_dword v39, v158, s[44:47], 0 offen +buffer_store_dword v42, v162, s[44:47], 0 offen +v_add_f32_e64 v40, v40, s55 +v_mul_f32_e64 v189, v40, s36 +v_cmp_lt_f32_e64 vcc, v40, 0 +v_cndmask_b32_e32 v40, v40, v189, vcc +v_add_f32_e64 v43, v43, s55 +v_mul_f32_e64 v189, v43, s36 +v_cmp_lt_f32_e64 vcc, v43, 0 +v_cndmask_b32_e32 v43, v43, v189, vcc +buffer_store_dword v40, v159, s[44:47], 0 offen +buffer_store_dword v43, v163, s[44:47], 0 offen +s_add_u32 s44, s44, s67 +s_addc_u32 s45, s45, 0 +s_sub_u32 s93, s93, 1 +s_cselect_b32 s47, 0, s47 +v_readlane_b32 s55, v187, 11 +v_add_f32_e64 v44, v44, s55 +v_mul_f32_e64 v189, v44, s36 +v_cmp_lt_f32_e64 vcc, v44, 0 +v_cndmask_b32_e32 v44, v44, v189, vcc +v_add_f32_e64 v47, v47, s55 +v_mul_f32_e64 v189, v47, s36 +v_cmp_lt_f32_e64 vcc, v47, 0 +v_cndmask_b32_e32 v47, v47, v189, vcc +buffer_store_dword v44, v157, s[44:47], 0 offen +buffer_store_dword v47, v161, s[44:47], 0 offen +v_add_f32_e64 v45, v45, s55 +v_mul_f32_e64 v189, v45, s36 +v_cmp_lt_f32_e64 vcc, v45, 0 +v_cndmask_b32_e32 v45, v45, v189, vcc +v_add_f32_e64 v48, v48, s55 +v_mul_f32_e64 v189, v48, s36 +v_cmp_lt_f32_e64 vcc, v48, 0 +v_cndmask_b32_e32 v48, v48, v189, vcc +buffer_store_dword v45, v158, s[44:47], 0 offen +buffer_store_dword v48, v162, s[44:47], 0 offen +v_add_f32_e64 v46, v46, s55 +v_mul_f32_e64 v189, v46, s36 +v_cmp_lt_f32_e64 vcc, v46, 0 +v_cndmask_b32_e32 v46, v46, v189, vcc +v_add_f32_e64 v49, v49, s55 +v_mul_f32_e64 v189, v49, s36 +v_cmp_lt_f32_e64 vcc, v49, 0 +v_cndmask_b32_e32 v49, v49, v189, vcc +buffer_store_dword v46, v159, s[44:47], 0 offen +buffer_store_dword v49, v163, s[44:47], 0 offen +s_add_u32 s44, s44, s67 +s_addc_u32 s45, s45, 0 +s_sub_u32 s93, s93, 1 +s_cselect_b32 s47, 0, s47 +s_add_u32 s44, s44, s52 +s_addc_u32 s45, s45, 0 +s_lshl_b32 s52, s52, 2 +s_add_u32 s44, s44, s52 +s_addc_u32 s45, s45, 0 +s_sub_u32 s93, s93, 20 +s_cselect_b32 s47, 0, s47 +s_cselect_b32 s51, 0, s51 +s_add_u32 s48, s48, 0x80 +s_addc_u32 s49, s49, 0 +s_sub_u32 s50, s50, 0x80 +s_cselect_b32 s51, 0, s51 +v_mov_b32_e32 v2, 0 +v_mov_b32_e32 v3, 0 +v_mov_b32_e32 v4, 0 +v_mov_b32_e32 v5, 0 +v_mov_b32_e32 v6, 0 +v_mov_b32_e32 v7, 0 +v_mov_b32_e32 v8, 0 +v_mov_b32_e32 v9, 0 +v_mov_b32_e32 v10, 0 +v_mov_b32_e32 v11, 0 +v_mov_b32_e32 v12, 0 +v_mov_b32_e32 v13, 0 +v_mov_b32_e32 v14, 0 +v_mov_b32_e32 v15, 0 +v_mov_b32_e32 v16, 0 +v_mov_b32_e32 v17, 0 +v_mov_b32_e32 v18, 0 +v_mov_b32_e32 v19, 0 +v_mov_b32_e32 v20, 0 +v_mov_b32_e32 v21, 0 +v_mov_b32_e32 v22, 0 +v_mov_b32_e32 v23, 0 +v_mov_b32_e32 v24, 0 +v_mov_b32_e32 v25, 0 +v_mov_b32_e32 v26, 0 +v_mov_b32_e32 v27, 0 +v_mov_b32_e32 v28, 0 +v_mov_b32_e32 v29, 0 +v_mov_b32_e32 v30, 0 +v_mov_b32_e32 v31, 0 +v_mov_b32_e32 v32, 0 +v_mov_b32_e32 v33, 0 +v_mov_b32_e32 v34, 0 +v_mov_b32_e32 v35, 0 +v_mov_b32_e32 v36, 0 +v_mov_b32_e32 v37, 0 +v_mov_b32_e32 v38, 0 +v_mov_b32_e32 v39, 0 +v_mov_b32_e32 v40, 0 +v_mov_b32_e32 v41, 0 +v_mov_b32_e32 v42, 0 +v_mov_b32_e32 v43, 0 +v_mov_b32_e32 v44, 0 +v_mov_b32_e32 v45, 0 +v_mov_b32_e32 v46, 0 +v_mov_b32_e32 v47, 0 +v_mov_b32_e32 v48, 0 +v_mov_b32_e32 v49, 0 +v_mov_b32_e32 v50, 0 +v_mov_b32_e32 v51, 0 +v_mov_b32_e32 v52, 0 +v_mov_b32_e32 v53, 0 +v_mov_b32_e32 v54, 0 +v_mov_b32_e32 v55, 0 +v_mov_b32_e32 v56, 0 +v_mov_b32_e32 v57, 0 +v_mov_b32_e32 v58, 0 +v_mov_b32_e32 v59, 0 +v_mov_b32_e32 v60, 0 +v_mov_b32_e32 v61, 0 +v_mov_b32_e32 v62, 0 +v_mov_b32_e32 v63, 0 +v_mov_b32_e32 v64, 0 +v_mov_b32_e32 v65, 0 +s_xor_b32 s18, s18, 0x200000 +s_mul_i32 s94, s60, s61 +s_mul_i32 s94, s94, s13 +s_add_u32 s52, s93, s92 +s_cmp_lt_i32 s52, 0 +s_cbranch_scc0 270 +v_and_b32_e32 v157, 0x7f, v1 +v_lshrrev_b32_e32 v157, 1, v157 +v_bfi_b32 v157, 1, v1, v157 +v_and_b32_e64 v158, v1, 2 +v_mad_u32_u24 v157, v158, 16, v157 +v_lshlrev_b32_e32 v157, 2, v157 +v_add_co_u32_e64 v157, vcc, v157, s97 +v_and_b32_e32 v158, 3, v1 +v_lshlrev_b32_e32 v158, 2, v158 +v_add_co_u32_e64 v158, vcc, v158, s97 +ds_read_b32 v191, v158 offset:256 +ds_read_b32 v157, v157 +s_add_u32 s97, s97, 0x18c +s_cmp_eq_u32 s97, 0xffc0 +s_cselect_b32 s97, 0xc1e0, s97 +s_waitcnt lgkmcnt(0) +v_readfirstlane_b32 s95, v157 +v_readlane_b32 s54, v191, 0 +s_bitcmp1_b32 s54, 18 +s_cbranch_scc1 245 +v_readlane_b32 s52, v191, 1 +v_readlane_b32 s53, v191, 2 +s_add_u32 s93, s92, s53 +s_lshr_b32 s55, -1, 16 +s_and_b32 s55, s55, s66 +s_lshr_b32 s56, s66, 16 +s_mul_i32 s56, s56, s95 +s_mul_i32 s44, s55, s95 +s_lshl_b32 s55, s56, 16 +s_lshr_b32 s56, s56, 16 +s_add_u32 s44, s55, s44 +s_addc_u32 s45, s56, 0 +s_add_u32 s44, s44, s24 +s_addc_u32 s45, s45, s25 +s_mul_i32 s55, s67, s93 +s_add_u32 s44, s44, s55 +s_addc_u32 s45, s45, 0 +s_mov_b32 s47, 0x31014000 +s_bitcmp1_b32 s18, 7 +s_cselect_b32 s51, 0x31014000, 0 +s_lshl_b32 s55, s93, 2 +s_add_u32 s48, s34, s55 +s_addc_u32 s49, s35, 0 +s_lshl_b32 s56, s52, 2 +s_sub_u32 s50, s56, s55 +s_cselect_b32 s51, 0, s51 +s_sub_u32 s93, s52, s53 +s_sub_u32 s93, s93, 1 +s_sub_u32 s93, s93, s92 +s_cselect_b32 s47, 0, s47 +v_bfe_u32 v189, v157, 16, 16 +v_bfe_u32 v190, v157, 0, 16 +v_and_b32_e64 v191, v1, 7 +v_sub_co_u32_e64 v192, vcc, 7, v191 +v_min_u32_e32 v191, v191, v192 +v_bfe_u32 v192, v191, 1, 1 +v_bfe_u32 v191, v191, 0, 1 +v_mov_b32_dpp v189, v189 quad_perm:[3,3,3,3] row_mask:0xf bank_mask:0xf +v_mov_b32_dpp v190, v190 quad_perm:[3,3,3,3] row_mask:0xf bank_mask:0xf +v_add_co_u32_e64 v189, vcc, v189, v192 +v_add_co_u32_e64 v190, vcc, v190, v191 +v_mov_b32_dpp v191, v157 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xf +v_cmp_ge_u32_e64 s[52:53], v191, s12 +v_sub_co_u32_e64 v191, vcc, v191, s95 +v_mul_lo_u32 v191, v191, s66 +v_xor_b32_dpp v192, v1, v1 quad_perm:[0,1,3,2] row_mask:0xf bank_mask:0xf +v_xor_b32_dpp v192, v1, v1 quad_perm:[1,0,2,3] row_mask:0xf bank_mask:0xa +v_xor_b32_dpp v164, v1, v1 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0xf +v_xor_b32_dpp v164, v1, v1 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +v_add_co_u32_e64 v164, vcc, v190, v164 +v_add_co_u32_e64 v192, vcc, v189, v192 +v_mad_i32_i24 v161, v192, s33, v164 +v_lshlrev_b32_e32 v161, 2, v161 +v_add_co_u32_e64 v161, vcc, v161, v191 +v_cmp_ge_u32_e64 s[56:57], v164, s33 +s_or_b64 s[56:57], s[56:57], s[52:53] +v_cmp_ge_u32_e64 s[54:55], v192, s32 +s_or_b64 s[56:57], s[56:57], s[54:55] +v_cndmask_b32_e64 v161, v161, -1, s[56:57] +v_xor_b32_dpp v192, v1, v1 quad_perm:[1,0,2,3] row_mask:0xf bank_mask:0xf +v_xor_b32_dpp v192, v1, v1 quad_perm:[0,1,3,2] row_mask:0xf bank_mask:0xa +v_xor_b32_dpp v164, v1, v1 quad_perm:[1,1,2,2] row_mask:0xf bank_mask:0xf +v_add_co_u32_e64 v164, vcc, v190, v164 +v_add_co_u32_e64 v192, vcc, v189, v192 +v_mad_i32_i24 v162, v192, s33, v164 +v_lshlrev_b32_e32 v162, 2, v162 +v_add_co_u32_e64 v162, vcc, v162, v191 +v_cmp_ge_u32_e64 s[56:57], v164, s33 +s_or_b64 s[56:57], s[56:57], s[52:53] +v_cmp_ge_u32_e64 s[54:55], v192, s32 +s_or_b64 s[56:57], s[56:57], s[54:55] +v_cndmask_b32_e64 v162, v162, -1, s[56:57] +v_xor_b32_dpp v192, v1, v1 quad_perm:[0,1,3,2] row_mask:0xf bank_mask:0xf +v_xor_b32_dpp v192, v1, v1 quad_perm:[1,0,2,3] row_mask:0xf bank_mask:0xa +v_xor_b32_dpp v164, v1, v1 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xf +v_xor_b32_dpp v164, v1, v1 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0xa +v_add_co_u32_e64 v164, vcc, v190, v164 +v_add_co_u32_e64 v192, vcc, v189, v192 +v_mad_i32_i24 v163, v192, s33, v164 +v_lshlrev_b32_e32 v163, 2, v163 +v_add_co_u32_e64 v163, vcc, v163, v191 +v_cmp_ge_u32_e64 s[56:57], v164, s33 +s_or_b64 s[56:57], s[56:57], s[52:53] +v_cmp_ge_u32_e64 s[54:55], v192, s32 +s_or_b64 s[56:57], s[56:57], s[54:55] +v_cndmask_b32_e64 v163, v163, -1, s[56:57] +v_bfe_u32 v189, v157, 16, 16 +v_bfe_u32 v190, v157, 0, 16 +v_and_b32_e64 v191, v1, 7 +v_sub_co_u32_e64 v192, vcc, 7, v191 +v_min_u32_e32 v191, v191, v192 +v_bfe_u32 v192, v191, 1, 1 +v_bfe_u32 v191, v191, 0, 1 +v_mov_b32_dpp v189, v189 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf +v_mov_b32_dpp v190, v190 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf +v_add_co_u32_e64 v189, vcc, v189, v192 +v_add_co_u32_e64 v190, vcc, v190, v191 +v_mov_b32_dpp v191, v157 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf +v_cmp_ge_u32_e64 s[52:53], v191, s12 +v_sub_co_u32_e64 v191, vcc, v191, s95 +v_mul_lo_u32 v191, v191, s66 +v_xor_b32_dpp v192, v1, v1 quad_perm:[0,1,3,2] row_mask:0xf bank_mask:0xf +v_xor_b32_dpp v192, v1, v1 quad_perm:[1,0,2,3] row_mask:0xf bank_mask:0xa +v_xor_b32_dpp v160, v1, v1 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0xf +v_xor_b32_dpp v160, v1, v1 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +v_add_co_u32_e64 v160, vcc, v190, v160 +v_add_co_u32_e64 v192, vcc, v189, v192 +v_mad_i32_i24 v157, v192, s33, v160 +v_lshlrev_b32_e32 v157, 2, v157 +v_add_co_u32_e64 v157, vcc, v157, v191 +v_cmp_ge_u32_e64 s[56:57], v160, s33 +s_or_b64 s[56:57], s[56:57], s[52:53] +v_cmp_ge_u32_e64 s[54:55], v192, s32 +s_or_b64 s[56:57], s[56:57], s[54:55] +v_cndmask_b32_e64 v157, v157, -1, s[56:57] +v_xor_b32_dpp v192, v1, v1 quad_perm:[1,0,2,3] row_mask:0xf bank_mask:0xf +v_xor_b32_dpp v192, v1, v1 quad_perm:[0,1,3,2] row_mask:0xf bank_mask:0xa +v_xor_b32_dpp v160, v1, v1 quad_perm:[1,1,2,2] row_mask:0xf bank_mask:0xf +v_add_co_u32_e64 v160, vcc, v190, v160 +v_add_co_u32_e64 v192, vcc, v189, v192 +v_mad_i32_i24 v158, v192, s33, v160 +v_lshlrev_b32_e32 v158, 2, v158 +v_add_co_u32_e64 v158, vcc, v158, v191 +v_cmp_ge_u32_e64 s[56:57], v160, s33 +s_or_b64 s[56:57], s[56:57], s[52:53] +v_cmp_ge_u32_e64 s[54:55], v192, s32 +s_or_b64 s[56:57], s[56:57], s[54:55] +v_cndmask_b32_e64 v158, v158, -1, s[56:57] +v_xor_b32_dpp v192, v1, v1 quad_perm:[0,1,3,2] row_mask:0xf bank_mask:0xf +v_xor_b32_dpp v192, v1, v1 quad_perm:[1,0,2,3] row_mask:0xf bank_mask:0xa +v_xor_b32_dpp v160, v1, v1 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xf +v_xor_b32_dpp v160, v1, v1 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0xa +v_add_co_u32_e64 v160, vcc, v190, v160 +v_add_co_u32_e64 v192, vcc, v189, v192 +v_mad_i32_i24 v159, v192, s33, v160 +v_lshlrev_b32_e32 v159, 2, v159 +v_add_co_u32_e64 v159, vcc, v159, v191 +v_cmp_ge_u32_e64 s[56:57], v160, s33 +s_or_b64 s[56:57], s[56:57], s[52:53] +v_cmp_ge_u32_e64 s[54:55], v192, s32 +s_or_b64 s[56:57], s[56:57], s[54:55] +v_cndmask_b32_e64 v159, v159, -1, s[56:57] +v_and_b32_e64 v187, v1, 63 +v_lshlrev_b32_e32 v187, 2, v187 +s_barrier +buffer_load_dword v187, v187, s[48:51], 0 offen +s_branch 62853 +s_endpgm +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_code_end +s_code_end +s_code_end +s_code_end +s_code_end diff --git a/src/kernels/Conv_Winograd_v21_1_2_gfx9_f3x2_fp32_stride1_group.inc b/src/kernels/Conv_Winograd_v21_1_2_gfx9_f3x2_fp32_stride1_group.inc new file mode 100644 index 0000000000..d0aeca27c7 --- /dev/null +++ b/src/kernels/Conv_Winograd_v21_1_2_gfx9_f3x2_fp32_stride1_group.inc @@ -0,0 +1,3063 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +v_mov_b32_e32 v0, v0 +s_mov_b32 s0, 0 +s_mov_b32 s1, 0 +s_mov_b32 s2, 0 +s_mov_b32 s3, 0 +v_mov_b32_e32 v104, 0 +s_mov_b32 m0, 0x1ffff +s_mov_b32 s97, 0xc1e0 +s_mov_b32 s96, 0xc1e0 +s_mov_b32 s91, 0 +v_lshlrev_b32_e32 v107, 2, v0 +v_add_co_u32_e32 v107, vcc, 0xffc0, v107 +v_cmp_ge_u32_e32 vcc, 12, v0 +s_cbranch_vccz 5 +v_mov_b32_e32 v106, 0 +v_cndmask_b32_e32 v107, -1, v107, vcc +ds_write_b32 v107, v106 +s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +s_barrier +v_readfirstlane_b32 s52, v0 +s_lshr_b32 s52, s52, 5 +s_add_u32 s52, s52, 8 +s_and_b32 s92, s52, 20 +s_mov_b64 s[40:41], s[6:7] +s_load_dwordx16 s[12:27], s[40:41], 0x0 +s_load_dwordx4 s[28:31], s[40:41], 0x40 +s_load_dwordx2 s[32:33], s[40:41], 0x50 +s_waitcnt lgkmcnt(0) +s_and_b32 s18, s18, 0xffff +s_bitcmp1_b32 s18, 6 +s_cbranch_scc0 16 +s_and_b32 s21, s21, 0xffff +s_and_b32 s23, s23, 0xffff +s_and_b32 s25, s25, 0xffff +s_and_b32 s27, s27, 0xffff +s_load_dwordx2 s[20:21], s[20:21], 0x0 +s_load_dwordx2 s[22:23], s[22:23], 0x0 +s_load_dwordx2 s[24:25], s[24:25], 0x0 +s_load_dwordx2 s[26:27], s[26:27], 0x0 +s_bitcmp1_b32 s18, 7 +s_cbranch_scc0 2 +s_load_dwordx2 s[34:35], s[40:41], 0x58 +s_mov_b32 s36, 1.0 +s_bitcmp1_b32 s18, 8 +s_cbranch_scc0 2 +s_load_dword s36, s[40:41], 0x60 +s_bitcmp1_b32 s18, 7 +s_cbranch_scc0 7 +s_bitcmp1_b32 s18, 6 +s_cbranch_scc0 5 +s_waitcnt lgkmcnt(0) +s_and_b32 s35, s35, 0xffff +s_load_dwordx2 s[34:35], s[34:35], 0x0 +s_bitcmp1_b32 s18, 9 +s_cbranch_scc0 72 +s_mov_b32 s42, 0x8c +s_mov_b32 s43, 0x9c +v_cmp_le_u32_e32 vcc, 0x100, v0 +s_cmp_eq_u32 1, src_vccz +s_cselect_b32 s42, s43, s42 +s_load_dword s65, s[40:41], 0x88 +s_load_dword s90, s[40:41], 0x98 +s_load_dword s68, s[40:41], s42 +s_load_dwordx2 s[66:67], s[40:41], 0xa8 +s_bitcmp1_b32 s18, 10 +s_cbranch_scc0 103 +s_load_dwordx4 s[44:47], s[40:41], 0xb8 +v_ffbh_u32_e32 v4, s17 +v_lshlrev_b32_e64 v5, v4, s17 +v_and_b32_e32 v6, 0xffffff00, v5 +v_cmp_eq_u32_e32 vcc, 0x80000000, v5 +v_cvt_f32_u32_e32 v6, v6 +v_rcp_f32_e32 v2, v6 +v_subb_co_u32_e32 v3, vcc, 32, v4, vcc +v_cvt_f32_ubyte0_e32 v4, v5 +v_fma_f32 v6, v6, v2, -1.0 +v_fma_f32 v6, v4, v2, v6 +v_madak_f32 v6, v6, v2, 0x9f000000 +v_mul_f32_e32 v6, 0x5f800000, v6 +v_mov_b32_e32 v4, 0 +v_cvt_flr_i32_f32_e64 v6, -v6 +v_lshl_add_u32 v2, v2, 9, v6 +v_mad_u64_u32 v[4:5], vcc, v5, v2, v[4:5] +v_subb_co_u32_e64 v2, vcc, v2, -1, vcc +v_mul_hi_u32 v4, s8, v2 +v_add_co_u32_e64 v2, vcc, v4, s8 +v_addc_co_u32_e64 v4, vcc, 0, 0, vcc +v_cmp_eq_u32_e32 vcc, 32, v3 +v_cndmask_b32_e32 v2, v2, v4, vcc +v_alignbit_b32 v2, v4, v2, v3 +s_waitcnt lgkmcnt(0) +v_readfirstlane_b32 s48, v2 +s_mul_i32 s49, s48, s17 +s_sub_u32 s8, s8, s49 +s_mul_i32 s49, s45, s48 +s_add_u32 s20, s20, s49 +s_addc_u32 s21, s21, 0 +s_mul_i32 s49, s46, s48 +s_add_u32 s22, s22, s49 +s_addc_u32 s23, s23, 0 +s_mul_i32 s49, s47, s48 +s_add_u32 s24, s24, s49 +s_addc_u32 s25, s25, 0 +s_branch 49 +s_mul_i32 s42, s14, s15 +s_lshr_b32 s46, -1, 16 +s_and_b32 s46, s46, s42 +s_lshr_b32 s47, s42, 16 +s_mul_i32 s47, s47, s13 +s_mul_i32 s44, s46, s13 +s_lshl_b32 s46, s47, 16 +s_lshr_b32 s47, s47, 16 +s_add_u32 s44, s46, s44 +s_addc_u32 s45, s47, 0 +s_lshl_b32 s65, s44, 2 +s_lshl_b32 s68, s42, 2 +s_mul_i32 s43, s32, s33 +s_lshr_b32 s46, -1, 16 +s_and_b32 s46, s46, s43 +s_lshr_b32 s47, s43, 16 +s_mul_i32 s47, s47, s16 +s_mul_i32 s44, s46, s16 +s_lshl_b32 s46, s47, 16 +s_lshr_b32 s47, s47, 16 +s_add_u32 s44, s46, s44 +s_addc_u32 s45, s47, 0 +s_lshl_b32 s66, s44, 2 +s_lshl_b32 s67, s43, 2 +s_bitcmp1_b32 s18, 13 +s_cbranch_scc0 2 +s_load_dwordx8 s[48:55], s[40:41], 0x68 +s_mul_i32 s42, s28, s29 +s_lshl_b32 s42, s42, 2 +s_bitcmp1_b32 s18, 2 +s_cselect_b32 s43, s16, s13 +s_lshr_b32 s44, -1, 16 +s_and_b32 s44, s44, s42 +s_lshr_b32 s45, s42, 16 +s_mul_i32 s45, s45, s43 +s_mul_i32 s56, s44, s43 +s_lshl_b32 s44, s45, 16 +s_lshr_b32 s45, s45, 16 +s_add_u32 s56, s44, s56 +s_addc_u32 s57, s45, 0 +s_mov_b32 s43, s56 +s_bitcmp1_b32 s18, 2 +s_cselect_b32 s44, s43, s42 +s_cselect_b32 s90, s42, s43 +v_cmp_le_u32_e32 vcc, 0x100, v0 +s_cmp_eq_u32 1, src_vccz +s_cselect_b32 s68, s44, s68 +s_waitcnt lgkmcnt(0) +s_and_b32 s21, s21, 0xffff +s_and_b32 s23, s23, 0xffff +s_and_b32 s25, s25, 0xffff +s_and_b32 s27, s27, 0xffff +s_and_b32 s35, s35, 0xffff +s_bitcmp1_b32 s18, 13 +s_cbranch_scc0 8 +s_add_u32 s20, s20, s48 +s_addc_u32 s21, s21, s49 +s_add_u32 s22, s22, s50 +s_addc_u32 s23, s23, s51 +s_add_u32 s24, s24, s52 +s_addc_u32 s25, s25, s53 +s_add_u32 s34, s34, s54 +s_addc_u32 s35, s35, s55 +s_and_b32 s44, 0, s30 +s_addc_u32 s44, s32, 0 +s_ashr_i32 s44, s44, 0 +s_add_u32 s42, s44, 2 +v_mov_b32_e32 v2, 0x55555556 +v_mul_hi_u32 v2, v2, s42 +v_readfirstlane_b32 s42, v2 +s_andn2_b32 s44, 0, s31 +s_addc_u32 s44, s33, 0 +s_ashr_i32 s44, s44, 0 +s_add_u32 s43, s44, 2 +v_mov_b32_e32 v2, 0x55555556 +v_mul_hi_u32 v2, v2, s43 +v_readfirstlane_b32 s43, v2 +s_sub_u32 s75, 0, s43 +s_sub_u32 s74, 0, s42 +s_add_u32 s60, s28, 1 +v_mov_b32_e32 v2, 0x80000000 +v_mul_hi_u32 v2, v2, s60 +v_readfirstlane_b32 s60, v2 +s_add_u32 s61, s29, 1 +v_mov_b32_e32 v2, 0x80000000 +v_mul_hi_u32 v2, v2, s61 +v_readfirstlane_b32 s61, v2 +v_mad_i32_i24 v2, 2, s60, -1 +v_sub_co_u32_e64 v2, vcc, v2, s28 +v_addc_co_u32_e64 v2, vcc, 0, 0, vcc +v_readfirstlane_b32 s44, v2 +s_and_b32 s44, s44, 0 +s_and_b32 s44, s44, s60 +s_add_u32 s60, s60, s44 +v_readfirstlane_b32 s45, v0 +s_and_b32 s48, s45, 64 +s_cselect_b32 s48, 0x80000, 0 +s_or_b32 s18, s18, s48 +s_lshl_b32 s69, s68, 1 +s_mov_b64 s[70:71], 0 +s_bitcmp1_b32 s18, 12 +s_cselect_b32 s44, 0, -1 +s_bitcmp1_b32 s18, 11 +s_cselect_b32 s44, s44, 1 +s_cmp_gt_u32 s61, s44 +s_cbranch_scc0 8 +s_bitset1_b32 s18, 23 +s_bitset1_b32 s18, 20 +s_bitset0_b32 s18, 19 +s_ashr_i32 s69, s69, 1 +s_ashr_i64 s[70:71], s[70:71], 1 +s_add_u32 s61, s61, 1 +s_and_b32 s61, s61, -2 +s_branch 16 +s_and_b32 s48, s13, 1 +s_cselect_b32 s48, 0, 0x1000000 +s_bitcmp1_b32 s18, 2 +s_cselect_b32 s48, 0, s48 +s_or_b32 s18, s18, s48 +s_cmp_eq_u32 s48, 0 +s_cselect_b32 s69, s68, s69 +s_cselect_b32 s70, s68, s70 +s_cselect_b32 s71, 0, s71 +s_bitcmp0_b32 s45, 8 +s_cselect_b32 s48, s48, 0 +s_cmp_eq_u32 s48, 0 +s_cselect_b32 s48, 0, 0x80000 +s_andn2_b32 s18, s18, s48 +s_add_u32 s70, s70, s69 +s_addc_u32 s71, s71, 0 +v_bfe_u32 v3, v0, 2, 6 +v_lshrrev_b32_e32 v99, 1, v3 +s_bitcmp0_b32 s45, 8 +s_cselect_b32 s48, 0x1000000, 0 +s_or_b32 s48, s48, 0x100000 +s_and_b32 s48, s18, s48 +s_cselect_b32 s48, 0, 15 +v_bfi_b32 v99, s48, v3, v99 +s_mul_i32 s88, s12, s42 +s_sub_u32 s88, s88, 1 +s_lshr_b32 s88, s88, 0 +s_add_u32 s88, s88, 1 +s_lshr_b32 s46, -1, 16 +s_and_b32 s46, s46, s88 +s_lshr_b32 s47, s88, 16 +s_mul_i32 s47, s47, s43 +s_mul_i32 s88, s46, s43 +s_lshl_b32 s46, s47, 16 +s_lshr_b32 s47, s47, 16 +s_add_u32 s88, s46, s88 +s_addc_u32 s89, s47, 0 +s_sub_u32 s88, s88, 1 +s_subb_u32 s89, s89, 0 +s_lshr_b64 s[88:89], s[88:89], 5 +s_add_u32 s88, s88, 1 +s_addc_u32 s89, s89, 0 +v_mov_b32_e32 v4, s8 +v_mov_b32_e32 v5, s17 +v_and_b32_e32 v6, 3, v0 +v_cmp_eq_u32_e32 vcc, 2, v6 +v_cndmask_b32_e32 v4, v4, v5, vcc +v_cmp_eq_u32_e32 vcc, 1, v6 +v_cndmask_b32_e32 v7, 0, v99, vcc +s_bitcmp1_b32 s18, 20 +s_cbranch_scc0 4 +v_add_co_u32_e64 v5, vcc, v99, 8 +v_cmp_eq_u32_e32 vcc, 0, v6 +v_cndmask_b32_e32 v7, v7, v5, vcc +v_cmp_eq_u32_e64 s[46:47], 3, v6 +v_bfe_u32 v97, v7, 0, 5 +v_mad_u32_u24 v97, v4, 32, v97 +v_ffbh_u32_e32 v9, s43 +v_lshlrev_b32_e64 v10, v9, s43 +v_and_b32_e32 v11, 0xffffff00, v10 +v_cmp_eq_u32_e32 vcc, 0x80000000, v10 +v_cvt_f32_u32_e32 v11, v11 +v_rcp_f32_e32 v98, v11 +v_subb_co_u32_e32 v8, vcc, 32, v9, vcc +v_cvt_f32_ubyte0_e32 v9, v10 +v_fma_f32 v11, v11, v98, -1.0 +v_fma_f32 v11, v9, v98, v11 +v_madak_f32 v11, v11, v98, 0x9f000000 +v_mul_f32_e32 v11, 0x5f800000, v11 +v_mov_b32_e32 v9, 0 +v_cvt_flr_i32_f32_e64 v11, -v11 +v_lshl_add_u32 v98, v98, 9, v11 +v_mad_u64_u32 v[9:10], vcc, v10, v98, v[9:10] +v_subb_co_u32_e64 v98, vcc, v98, -1, vcc +v_mul_hi_u32 v9, v97, v98 +v_add_co_u32_e32 v98, vcc, v9, v97 +v_addc_co_u32_e64 v9, vcc, 0, 0, vcc +v_cmp_eq_u32_e32 vcc, 32, v8 +v_cndmask_b32_e32 v98, v98, v9, vcc +v_alignbit_b32 v98, v9, v98, v8 +v_mad_i32_i24 v96, v98, s75, v97 +v_lshrrev_b32_e32 v97, 5, v7 +v_mad_u32_u24 v97, v98, 1, v97 +v_cndmask_b32_e64 v97, v97, 1, s[46:47] +v_ffbh_u32_e32 v9, s42 +v_lshlrev_b32_e64 v10, v9, s42 +v_and_b32_e32 v11, 0xffffff00, v10 +v_cmp_eq_u32_e32 vcc, 0x80000000, v10 +v_cvt_f32_u32_e32 v11, v11 +v_rcp_f32_e32 v98, v11 +v_subb_co_u32_e32 v8, vcc, 32, v9, vcc +v_cvt_f32_ubyte0_e32 v9, v10 +v_fma_f32 v11, v11, v98, -1.0 +v_fma_f32 v11, v9, v98, v11 +v_madak_f32 v11, v11, v98, 0x9f000000 +v_mul_f32_e32 v11, 0x5f800000, v11 +v_mov_b32_e32 v9, 0 +v_cvt_flr_i32_f32_e64 v11, -v11 +v_lshl_add_u32 v98, v98, 9, v11 +v_mad_u64_u32 v[9:10], vcc, v10, v98, v[9:10] +v_subb_co_u32_e64 v98, vcc, v98, -1, vcc +v_mul_hi_u32 v9, v97, v98 +v_add_co_u32_e32 v98, vcc, v9, v97 +v_addc_co_u32_e64 v9, vcc, 0, 0, vcc +v_cmp_eq_u32_e32 vcc, 32, v8 +v_cndmask_b32_e32 v98, v98, v9, vcc +v_alignbit_b32 v98, v9, v98, v8 +v_mad_i32_i24 v97, v98, s74, v97 +v_readlane_b32 s76, v96, 2 +v_readlane_b32 s77, v97, 2 +v_readlane_b32 s78, v98, 2 +v_readlane_b32 s79, v97, 3 +v_readlane_b32 s80, v98, 3 +v_add_co_u32_e64 v96, vcc, v96, s75 +v_add_co_u32_e64 v97, vcc, v97, s74 +v_mov_b32_dpp v98, v98 quad_perm:[1,1,0,0] row_mask:0xf bank_mask:0xf +v_mov_b32_dpp v96, v96 quad_perm:[1,1,0,0] row_mask:0xf bank_mask:0xf +v_mov_b32_dpp v97, v97 quad_perm:[1,1,0,0] row_mask:0xf bank_mask:0xf +s_mov_b32 s42, 0x80000000 +s_mov_b32 s43, 0x20000 +s_mov_b32 s46, 0x80000000 +s_mov_b32 s47, 0x20000 +v_cmp_le_u32_e32 vcc, 0x100, v0 +s_cbranch_vccnz 5 +v_xor_b32_dpp v100, v0, v0 quad_perm:[2,3,2,1] row_mask:0xf bank_mask:0xf +v_subrev_co_u32_e32 v100, vcc, 1, v100 +v_cvt_f32_i32_e32 v100, v100 +s_branch 4 +v_xor_b32_dpp v100, v0, v0 quad_perm:[2,1,0,1] row_mask:0xf bank_mask:0xf +v_sub_co_u32_e32 v100, vcc, 1, v100 +v_cvt_f32_i32_e32 v100, v100 +v_mov_b32_e32 v101, 1 +v_xor_b32_dpp v101, v0, v0 quad_perm:[2,3,2,3] row_mask:0xf bank_mask:0x4 +v_xor_b32_dpp v101, v0, v0 quad_perm:[0,1,0,1] row_mask:0xf bank_mask:0x8 +v_subrev_co_u32_e32 v101, vcc, 1, v101 +v_mov_b32_e32 v102, 1 +v_xor_b32_dpp v102, v0, v0 quad_perm:[0,3,2,1] row_mask:0xf bank_mask:0x2 +v_xor_b32_dpp v102, v0, v0 quad_perm:[2,1,0,3] row_mask:0xf bank_mask:0x4 +v_subrev_co_u32_e32 v102, vcc, 1, v102 +v_cvt_f32_i32_e32 v101, v101 +v_cvt_f32_i32_e32 v102, v102 +v_lshrrev_b32_e64 v106, 2, s92 +v_and_b32_e32 v107, 3, v0 +v_bfe_u32 v108, v0, 4, 3 +v_mad_u32_u24 v95, v108, 4, v107 +v_lshlrev_b32_e32 v95, 4, v95 +v_mad_u32_u24 v90, v106, 4, v107 +v_lshlrev_b32_e32 v90, 4, v90 +v_bfe_u32 v106, v0, 2, 2 +v_and_b32_e32 v107, 1, v106 +v_mad_u32_u24 v109, v106, 16, v107 +v_lshlrev_b32_e32 v109, 6, v109 +v_xor_b32_e32 v90, v90, v109 +v_mul_u32_u24_e32 v109, 0x400, v106 +v_xor_b32_e32 v95, v95, v109 +s_lshr_b32 s92, s92, 0 +v_cmp_le_u32_e32 vcc, 0x100, v0 +s_cbranch_vccnz 47 +s_and_b32 s53, s18, 0x1100000 +s_addc_u32 s53, 0, 0 +v_lshrrev_b32_e32 v109, 1, v0 +s_mul_i32 s52, 60, s53 +s_sub_u32 s52, 63, s52 +v_bfi_b32 v109, s52, v0, v109 +v_and_b32_e32 v106, 1, v109 +v_bfe_u32 v107, v109, 1, 1 +v_xor_b32_e32 v106, v106, v107 +v_bfe_u32 v108, v109, 3, 1 +v_mad_u32_u24 v107, v107, 2, v108 +v_mul_u32_u24_e32 v106, 0x118, v106 +v_bfe_u32 v108, v109, 2, 1 +v_mad_u32_u24 v107, v107, 2, v106 +v_xor_b32_e32 v107, v107, v108 +v_and_b32_e32 v108, 0xf0, v109 +v_xor_b32_e32 v107, v107, v108 +s_mul_i32 s52, 4, s53 +s_sub_u32 s52, 6, s52 +v_bfe_u32 v109, v0, s52, 1 +v_mul_u32_u24_e32 v109, 0x1040, v109 +v_xor_b32_e32 v92, 0x314, v107 +v_xor_b32_e32 v93, 0x31c, v107 +v_xor_b32_e32 v94, 8, v107 +v_mov_b32_e32 v91, v107 +v_mad_u32_u24 v91, 4, v91, v109 +v_mad_u32_u24 v92, 4, v92, v109 +v_mad_u32_u24 v93, 4, v93, v109 +v_mad_u32_u24 v94, 4, v94, v109 +s_branch 44 +s_bfe_u32 s53, s18, 0x10014 +v_lshrrev_b32_e32 v109, 1, v0 +s_mul_i32 s52, 60, s53 +s_sub_u32 s52, 63, s52 +v_bfi_b32 v109, s52, v0, v109 +v_and_b32_e32 v106, 1, v109 +v_bfe_u32 v107, v109, 1, 1 +v_bfe_u32 v108, v109, 3, 1 +v_xor_b32_e32 v106, v106, v107 +v_mad_u32_u24 v107, v107, 2, v108 +v_mul_u32_u24_e32 v106, 0x109, v106 +v_bfe_u32 v108, v109, 2, 1 +v_mad_u32_u24 v107, v107, 2, v106 +v_xor_b32_e32 v107, v107, v108 +v_and_b32_e32 v108, 0xf0, v109 +v_or_b32_e32 v107, v107, v108 +s_mul_i32 s52, 4, s53 +s_sub_u32 s52, 6, s52 +v_bfe_u32 v109, v0, s52, 1 +v_mul_u32_u24_e32 v109, 0x1040, v109 +v_mad_u32_u24 v91, 4, v107, v109 +v_xor_b32_e32 v92, 0x307, v107 +v_mad_u32_u24 v92, 4, v92, v109 +v_xor_b32_e32 v93, 0x30f, v107 +v_mad_u32_u24 v93, 4, v93, v109 +v_xor_b32_e32 v94, 8, v107 +v_mad_u32_u24 v94, 4, v94, v109 +v_subrev_co_u32_e32 v96, vcc, s76, v96 +v_mov_b32_e32 v107, s75 +v_cmp_lt_i32_e32 vcc, v96, v107 +v_subb_co_u32_e64 v106, vcc, 0, 0, vcc +v_mad_i32_i24 v96, v106, s75, v96 +v_mad_i32_i24 v98, v106, s80, v98 +v_mad_i32_i24 v97, v106, s79, v97 +v_mov_b32_e32 v107, s74 +v_cmp_lt_i32_e32 vcc, v97, v107 +v_subb_co_u32_e64 v106, vcc, 0, 0, vcc +v_add_co_u32_e32 v98, vcc, v98, v106 +v_mad_i32_i24 v97, v106, v107, v97 +v_subrev_co_u32_e32 v97, vcc, s77, v97 +v_cmp_lt_i32_e32 vcc, v97, v107 +v_subb_co_u32_e64 v106, vcc, 0, 0, vcc +v_add_co_u32_e32 v98, vcc, v98, v106 +v_mad_i32_i24 v97, v106, s74, v97 +v_subrev_co_u32_e32 v98, vcc, s78, v98 +s_mov_b32 s62, 0 +s_mov_b32 s63, s28 +s_mov_b32 s64, 1 +s_mov_b32 s84, 0 +s_mov_b32 s85, s16 +s_mov_b32 s83, s85 +s_sub_u32 s93, -1, s92 +s_sub_u32 s93, s93, 32 +s_bitset1_b32 s18, 21 +s_mov_b32 s47, 0 +s_mov_b32 s51, 0 +s_mov_b32 s94, 17 +s_mov_b32 s82, 0 +s_bitset1_b32 s18, 26 +s_call_b64 s[38:39], 1678 +v_cmp_le_u32_e32 vcc, 0x100, v0 +s_cbranch_vccnz 65 +s_branch 900 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +v_mac_f32_e32 v2, v34, v42 +v_mac_f32_e32 v3, v35, v42 +v_mac_f32_e32 v4, v36, v42 +v_mac_f32_e32 v5, v37, v42 +v_mac_f32_e32 v6, v34, v43 +v_mac_f32_e32 v7, v35, v43 +v_mac_f32_e32 v8, v36, v43 +v_mac_f32_e32 v9, v37, v43 +v_mac_f32_e32 v10, v34, v44 +v_mac_f32_e32 v11, v35, v44 +v_mac_f32_e32 v12, v36, v44 +v_mac_f32_e32 v13, v37, v44 +v_mac_f32_e32 v14, v34, v45 +v_mac_f32_e32 v15, v35, v45 +v_mac_f32_e32 v16, v36, v45 +v_mac_f32_e32 v17, v37, v45 +v_mac_f32_e32 v18, v34, v46 +v_mac_f32_e32 v19, v35, v46 +v_mac_f32_e32 v20, v36, v46 +v_mac_f32_e32 v21, v37, v46 +v_mac_f32_e32 v22, v34, v47 +v_mac_f32_e32 v23, v35, v47 +v_mac_f32_e32 v24, v36, v47 +v_mac_f32_e32 v25, v37, v47 +v_mac_f32_e32 v26, v34, v48 +v_mac_f32_e32 v27, v35, v48 +v_mac_f32_e32 v28, v36, v48 +v_mac_f32_e32 v29, v37, v48 +v_mac_f32_e32 v30, v34, v49 +v_mac_f32_e32 v31, v35, v49 +v_subrev_f32_e64 v66, v68, v66 div:2 +v_subrev_f32_e64 v69, v67, v69 div:2 +v_add_f32_e64 v67, v68, v67 div:2 +v_mad_f32 v68, v68, 1.0, -v67 +s_setprio 0 +s_nop 0 +v_mac_f32_e32 v32, v36, v49 +v_mac_f32_e32 v33, v37, v49 +s_nop 0 +ds_read_b128 v[34:37], v95 offset:29440 +ds_read_b128 v[42:45], v90 offset:28928 +ds_read_b128 v[46:49], v90 offset:29056 +ds_write_b32 v91, v62 +ds_write_b32 v92, v63 +s_setprio 1 +s_add_u32 s40, s40, s70 +s_addc_u32 s41, s41, s71 +buffer_load_dword v58, v82, s[40:43], 0 offen +buffer_load_dword v60, v84, s[40:43], 0 offen +buffer_load_dword v59, v83, s[40:43], 0 offen +buffer_load_dword v61, v85, s[40:43], 0 offen +s_add_u32 s91, s91, 0x200 +s_nop 0 +s_waitcnt lgkmcnt(5) +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 1 +s_call_b64 s[38:39], 1536 +v_mac_f32_e32 v2, v38, v50 +v_mac_f32_e32 v3, v39, v50 +v_mac_f32_e32 v4, v40, v50 +v_mac_f32_e32 v5, v41, v50 +v_mac_f32_e32 v6, v38, v51 +v_mac_f32_e32 v7, v39, v51 +v_mac_f32_e32 v8, v40, v51 +v_mac_f32_e32 v9, v41, v51 +v_mac_f32_e32 v10, v38, v52 +v_mac_f32_e32 v11, v39, v52 +v_mac_f32_e32 v12, v40, v52 +v_mac_f32_e32 v13, v41, v52 +v_mac_f32_e32 v14, v38, v53 +v_mac_f32_e32 v15, v39, v53 +v_mac_f32_e32 v16, v40, v53 +v_mac_f32_e32 v17, v41, v53 +v_mac_f32_e32 v18, v38, v54 +v_mac_f32_e32 v19, v39, v54 +v_mac_f32_e32 v20, v40, v54 +v_mac_f32_e32 v21, v41, v54 +v_mac_f32_e32 v22, v38, v55 +v_mac_f32_e32 v23, v39, v55 +v_mac_f32_e32 v24, v40, v55 +v_mac_f32_e32 v25, v41, v55 +v_mac_f32_e32 v26, v38, v56 +v_mac_f32_e32 v27, v39, v56 +v_mac_f32_e32 v28, v40, v56 +v_mac_f32_e32 v29, v41, v56 +v_mac_f32_e32 v30, v38, v57 +v_mac_f32_e32 v31, v39, v57 +v_mac_f32_dpp v66, v66, v100 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_dpp v67, v67, v100 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_dpp v68, v68, v100 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_dpp v69, v69, v100 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +s_setprio 0 +s_nop 0 +v_mac_f32_e32 v32, v40, v57 +v_mac_f32_e32 v33, v41, v57 +s_nop 0 +ds_read_b128 v[38:41], v95 offset:33536 +ds_read_b128 v[50:53], v90 offset:33024 +ds_read_b128 v[54:57], v90 offset:33152 +ds_write_b32 v93, v68 offset:8256 +ds_write_b32 v94, v69 offset:8256 +s_setprio 1 +s_nop 0 +s_waitcnt vmcnt(12) lgkmcnt(5) +ds_append v105 offset:65472 +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 3 +s_call_b64 s[38:39], 1474 +s_nop 0 +s_nop 0 +v_mac_f32_e32 v2, v34, v42 +v_mac_f32_e32 v3, v35, v42 +v_mac_f32_e32 v4, v36, v42 +v_mac_f32_e32 v5, v37, v42 +v_mac_f32_e32 v6, v34, v43 +v_mac_f32_e32 v7, v35, v43 +v_mac_f32_e32 v8, v36, v43 +v_mac_f32_e32 v9, v37, v43 +v_mac_f32_e32 v10, v34, v44 +v_mac_f32_e32 v11, v35, v44 +v_mac_f32_e32 v12, v36, v44 +v_mac_f32_e32 v13, v37, v44 +v_mac_f32_e32 v14, v34, v45 +v_mac_f32_e32 v15, v35, v45 +v_mac_f32_e32 v16, v36, v45 +v_mac_f32_e32 v17, v37, v45 +v_mac_f32_e32 v18, v34, v46 +v_mac_f32_e32 v19, v35, v46 +v_mac_f32_e32 v20, v36, v46 +v_mac_f32_e32 v21, v37, v46 +v_mac_f32_e32 v22, v34, v47 +v_mac_f32_e32 v23, v35, v47 +v_mac_f32_e32 v24, v36, v47 +v_mac_f32_e32 v25, v37, v47 +v_mac_f32_e32 v26, v34, v48 +v_mac_f32_e32 v27, v35, v48 +v_mac_f32_e32 v28, v36, v48 +v_mac_f32_e32 v29, v37, v48 +v_mac_f32_e32 v30, v34, v49 +v_mac_f32_e32 v31, v35, v49 +v_subrev_f32_e64 v70, v72, v70 div:2 +v_subrev_f32_e64 v73, v71, v73 div:2 +v_add_f32_e64 v71, v72, v71 div:2 +v_mad_f32 v72, v72, 1.0, -v71 +s_setprio 0 +s_nop 0 +v_mac_f32_e32 v32, v36, v49 +v_mac_f32_e32 v33, v37, v49 +s_nop 0 +ds_read_b128 v[34:37], v95 offset:37696 +ds_read_b128 v[42:45], v90 offset:37184 +ds_read_b128 v[46:49], v90 offset:37312 +ds_write_b32 v91, v66 offset:8256 +ds_write_b32 v92, v67 offset:8256 +s_setprio 1 +s_add_u32 s40, s40, s70 +s_addc_u32 s41, s41, s71 +buffer_load_dword v62, v82, s[40:43], 0 offen +buffer_load_dword v64, v84, s[40:43], 0 offen +buffer_load_dword v63, v83, s[40:43], 0 offen +buffer_load_dword v65, v85, s[40:43], 0 offen +s_mov_b32 m0, 0x2ffc0 +s_nop 0 +s_waitcnt lgkmcnt(5) +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 1 +s_call_b64 s[38:39], 1400 +v_mac_f32_e32 v2, v38, v50 +v_mac_f32_e32 v3, v39, v50 +v_mac_f32_e32 v4, v40, v50 +v_mac_f32_e32 v5, v41, v50 +v_mac_f32_e32 v6, v38, v51 +v_mac_f32_e32 v7, v39, v51 +v_mac_f32_e32 v8, v40, v51 +v_mac_f32_e32 v9, v41, v51 +v_mac_f32_e32 v10, v38, v52 +v_mac_f32_e32 v11, v39, v52 +v_mac_f32_e32 v12, v40, v52 +v_mac_f32_e32 v13, v41, v52 +v_mac_f32_e32 v14, v38, v53 +v_mac_f32_e32 v15, v39, v53 +v_mac_f32_e32 v16, v40, v53 +v_mac_f32_e32 v17, v41, v53 +v_mac_f32_e32 v18, v38, v54 +v_mac_f32_e32 v19, v39, v54 +v_mac_f32_e32 v20, v40, v54 +v_mac_f32_e32 v21, v41, v54 +v_mac_f32_e32 v22, v38, v55 +v_mac_f32_e32 v23, v39, v55 +v_mac_f32_e32 v24, v40, v55 +v_mac_f32_e32 v25, v41, v55 +v_mac_f32_e32 v26, v38, v56 +v_mac_f32_e32 v27, v39, v56 +v_mac_f32_e32 v28, v40, v56 +v_mac_f32_e32 v29, v41, v56 +v_mac_f32_e32 v30, v38, v57 +v_mac_f32_e32 v31, v39, v57 +v_mac_f32_e32 v32, v40, v57 +v_mac_f32_dpp v70, v70, v100 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_dpp v71, v71, v100 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_dpp v72, v72, v100 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_dpp v73, v73, v100 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +s_setprio 0 +s_nop 0 +v_mac_f32_e32 v33, v41, v57 +v_cmp_eq_u32_e64 vcc, src_lds_direct, s91 +s_nop 0 +s_nop 0 +s_cbranch_vccz 65531 +s_nop 0 +ds_read_b128 v[38:41], v95 offset:41792 +ds_read_b128 v[50:53], v90 offset:41280 +ds_read_b128 v[54:57], v90 offset:41408 +ds_write_b32 v93, v72 offset:16512 +ds_write_b32 v94, v73 offset:16512 +s_setprio 1 +s_nop 0 +s_waitcnt vmcnt(12) lgkmcnt(5) +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 8 +s_call_b64 s[38:39], 1335 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +v_mac_f32_e32 v2, v34, v42 +v_mac_f32_e32 v3, v35, v42 +v_mac_f32_e32 v4, v36, v42 +v_mac_f32_e32 v5, v37, v42 +v_mac_f32_e32 v6, v34, v43 +v_mac_f32_e32 v7, v35, v43 +v_mac_f32_e32 v8, v36, v43 +v_mac_f32_e32 v9, v37, v43 +v_mac_f32_e32 v10, v34, v44 +v_mac_f32_e32 v11, v35, v44 +v_mac_f32_e32 v12, v36, v44 +v_mac_f32_e32 v13, v37, v44 +v_mac_f32_e32 v14, v34, v45 +v_mac_f32_e32 v15, v35, v45 +v_mac_f32_e32 v16, v36, v45 +v_mac_f32_e32 v17, v37, v45 +v_mac_f32_e32 v18, v34, v46 +v_mac_f32_e32 v19, v35, v46 +v_mac_f32_e32 v20, v36, v46 +v_mac_f32_e32 v21, v37, v46 +v_mac_f32_e32 v22, v34, v47 +v_mac_f32_e32 v23, v35, v47 +v_mac_f32_e32 v24, v36, v47 +v_mac_f32_e32 v25, v37, v47 +v_mac_f32_e32 v26, v34, v48 +v_mac_f32_e32 v27, v35, v48 +v_mac_f32_e32 v28, v36, v48 +v_mac_f32_e32 v29, v37, v48 +v_mac_f32_e32 v30, v34, v49 +v_mac_f32_e32 v31, v35, v49 +v_subrev_f32_e64 v74, v76, v74 div:2 +v_subrev_f32_e64 v77, v75, v77 div:2 +v_add_f32_e64 v75, v76, v75 div:2 +v_mad_f32 v76, v76, 1.0, -v75 +s_setprio 0 +s_nop 0 +v_mac_f32_e32 v32, v36, v49 +v_mac_f32_e32 v33, v37, v49 +s_nop 0 +ds_read_b128 v[34:37], v95 offset:45952 +ds_read_b128 v[42:45], v90 offset:45440 +ds_read_b128 v[46:49], v90 offset:45568 +ds_write_b32 v91, v70 offset:16512 +ds_write_b32 v92, v71 offset:16512 +s_setprio 1 +s_add_u32 s40, s40, s70 +s_addc_u32 s41, s41, s71 +buffer_load_dword v66, v82, s[40:43], 0 offen +buffer_load_dword v68, v84, s[40:43], 0 offen +buffer_load_dword v67, v83, s[40:43], 0 offen +buffer_load_dword v69, v85, s[40:43], 0 offen +s_nop 0 +s_waitcnt lgkmcnt(5) +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 3 +s_call_b64 s[38:39], 1258 +s_nop 0 +s_nop 0 +v_mac_f32_e32 v2, v38, v50 +v_mac_f32_e32 v3, v39, v50 +v_mac_f32_e32 v4, v40, v50 +v_mac_f32_e32 v5, v41, v50 +v_mac_f32_e32 v6, v38, v51 +v_mac_f32_e32 v7, v39, v51 +v_mac_f32_e32 v8, v40, v51 +v_mac_f32_e32 v9, v41, v51 +v_mac_f32_e32 v10, v38, v52 +v_mac_f32_e32 v11, v39, v52 +v_mac_f32_e32 v12, v40, v52 +v_mac_f32_e32 v13, v41, v52 +v_mac_f32_e32 v14, v38, v53 +v_mac_f32_e32 v15, v39, v53 +v_mac_f32_e32 v16, v40, v53 +v_mac_f32_e32 v17, v41, v53 +v_mac_f32_e32 v18, v38, v54 +v_mac_f32_e32 v19, v39, v54 +v_mac_f32_e32 v20, v40, v54 +v_mac_f32_e32 v21, v41, v54 +v_mac_f32_e32 v22, v38, v55 +v_mac_f32_e32 v23, v39, v55 +v_mac_f32_e32 v24, v40, v55 +v_mac_f32_e32 v25, v41, v55 +v_mac_f32_e32 v26, v38, v56 +v_mac_f32_e32 v27, v39, v56 +v_mac_f32_e32 v28, v40, v56 +v_mac_f32_e32 v29, v41, v56 +v_mac_f32_e32 v30, v38, v57 +v_mac_f32_e32 v31, v39, v57 +v_mac_f32_dpp v74, v74, v100 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_dpp v75, v75, v100 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_dpp v76, v76, v100 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_dpp v77, v77, v100 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +s_setprio 0 +s_nop 0 +v_mac_f32_e32 v32, v40, v57 +v_mac_f32_e32 v33, v41, v57 +s_nop 0 +ds_read_b128 v[38:41], v95 offset:512 +ds_read_b128 v[50:53], v90 +ds_read_b128 v[54:57], v90 offset:128 +ds_write_b32 v93, v76 offset:24768 +ds_write_b32 v94, v77 offset:24768 +s_setprio 1 +s_nop 0 +s_waitcnt vmcnt(12) lgkmcnt(5) +ds_append v105 offset:65476 +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 3 +s_call_b64 s[38:39], 1194 +s_nop 0 +s_nop 0 +v_mac_f32_e32 v2, v34, v42 +v_mac_f32_e32 v3, v35, v42 +v_mac_f32_e32 v4, v36, v42 +v_mac_f32_e32 v5, v37, v42 +v_mac_f32_e32 v6, v34, v43 +v_mac_f32_e32 v7, v35, v43 +v_mac_f32_e32 v8, v36, v43 +v_mac_f32_e32 v9, v37, v43 +v_mac_f32_e32 v10, v34, v44 +v_mac_f32_e32 v11, v35, v44 +v_mac_f32_e32 v12, v36, v44 +v_mac_f32_e32 v13, v37, v44 +v_mac_f32_e32 v14, v34, v45 +v_mac_f32_e32 v15, v35, v45 +v_mac_f32_e32 v16, v36, v45 +v_mac_f32_e32 v17, v37, v45 +v_mac_f32_e32 v18, v34, v46 +v_mac_f32_e32 v19, v35, v46 +v_mac_f32_e32 v20, v36, v46 +v_mac_f32_e32 v21, v37, v46 +v_mac_f32_e32 v22, v34, v47 +v_mac_f32_e32 v23, v35, v47 +v_mac_f32_e32 v24, v36, v47 +v_mac_f32_e32 v25, v37, v47 +v_mac_f32_e32 v26, v34, v48 +v_mac_f32_e32 v27, v35, v48 +v_mac_f32_e32 v28, v36, v48 +v_mac_f32_e32 v29, v37, v48 +v_mac_f32_e32 v30, v34, v49 +v_mac_f32_e32 v31, v35, v49 +v_subrev_f32_e64 v78, v80, v78 div:2 +v_subrev_f32_e64 v81, v79, v81 div:2 +v_add_f32_e64 v79, v80, v79 div:2 +v_mad_f32 v80, v80, 1.0, -v79 +s_setprio 0 +s_nop 0 +v_mac_f32_e32 v32, v36, v49 +v_mac_f32_e32 v33, v37, v49 +s_nop 0 +ds_read_b128 v[34:37], v95 offset:4672 +ds_read_b128 v[42:45], v90 offset:4160 +ds_read_b128 v[46:49], v90 offset:4288 +ds_write_b32 v91, v74 offset:24768 +ds_write_b32 v92, v75 offset:24768 +s_setprio 1 +s_add_u32 s40, s40, s70 +s_addc_u32 s41, s41, s71 +buffer_load_dword v70, v82, s[40:43], 0 offen +buffer_load_dword v72, v84, s[40:43], 0 offen +buffer_load_dword v71, v83, s[40:43], 0 offen +buffer_load_dword v73, v85, s[40:43], 0 offen +s_mov_b32 m0, 0x2ffc4 +s_nop 0 +s_waitcnt lgkmcnt(5) +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 1 +s_call_b64 s[38:39], 1120 +v_mac_f32_e32 v2, v38, v50 +v_mac_f32_e32 v3, v39, v50 +v_mac_f32_e32 v4, v40, v50 +v_mac_f32_e32 v5, v41, v50 +v_mac_f32_e32 v6, v38, v51 +v_mac_f32_e32 v7, v39, v51 +v_mac_f32_e32 v8, v40, v51 +v_mac_f32_e32 v9, v41, v51 +v_mac_f32_e32 v10, v38, v52 +v_mac_f32_e32 v11, v39, v52 +v_mac_f32_e32 v12, v40, v52 +v_mac_f32_e32 v13, v41, v52 +v_mac_f32_e32 v14, v38, v53 +v_mac_f32_e32 v15, v39, v53 +v_mac_f32_e32 v16, v40, v53 +v_mac_f32_e32 v17, v41, v53 +v_mac_f32_e32 v18, v38, v54 +v_mac_f32_e32 v19, v39, v54 +v_mac_f32_e32 v20, v40, v54 +v_mac_f32_e32 v21, v41, v54 +v_mac_f32_e32 v22, v38, v55 +v_mac_f32_e32 v23, v39, v55 +v_mac_f32_e32 v24, v40, v55 +v_mac_f32_e32 v25, v41, v55 +v_mac_f32_e32 v26, v38, v56 +v_mac_f32_e32 v27, v39, v56 +v_mac_f32_e32 v28, v40, v56 +v_mac_f32_e32 v29, v41, v56 +v_mac_f32_e32 v30, v38, v57 +v_mac_f32_e32 v31, v39, v57 +v_mac_f32_e32 v32, v40, v57 +v_mac_f32_dpp v78, v78, v100 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_dpp v79, v79, v100 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_dpp v80, v80, v100 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_dpp v81, v81, v100 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +s_setprio 0 +s_nop 0 +v_mac_f32_e32 v33, v41, v57 +v_cmp_eq_u32_e64 vcc, src_lds_direct, s91 +s_nop 0 +s_nop 0 +s_cbranch_vccz 65531 +s_nop 0 +ds_read_b128 v[38:41], v95 offset:8768 +ds_read_b128 v[50:53], v90 offset:8256 +ds_read_b128 v[54:57], v90 offset:8384 +ds_write_b32 v93, v80 offset:33024 +ds_write_b32 v94, v81 offset:33024 +s_setprio 1 +s_nop 0 +s_waitcnt vmcnt(12) lgkmcnt(5) +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 8 +s_call_b64 s[38:39], 1055 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +v_mac_f32_e32 v2, v34, v42 +v_mac_f32_e32 v3, v35, v42 +v_mac_f32_e32 v4, v36, v42 +v_mac_f32_e32 v5, v37, v42 +v_mac_f32_e32 v6, v34, v43 +v_mac_f32_e32 v7, v35, v43 +v_mac_f32_e32 v8, v36, v43 +v_mac_f32_e32 v9, v37, v43 +v_mac_f32_e32 v10, v34, v44 +v_mac_f32_e32 v11, v35, v44 +v_mac_f32_e32 v12, v36, v44 +v_mac_f32_e32 v13, v37, v44 +v_mac_f32_e32 v14, v34, v45 +v_mac_f32_e32 v15, v35, v45 +v_mac_f32_e32 v16, v36, v45 +v_mac_f32_e32 v17, v37, v45 +v_mac_f32_e32 v18, v34, v46 +v_mac_f32_e32 v19, v35, v46 +v_mac_f32_e32 v20, v36, v46 +v_mac_f32_e32 v21, v37, v46 +v_mac_f32_e32 v22, v34, v47 +v_mac_f32_e32 v23, v35, v47 +v_mac_f32_e32 v24, v36, v47 +v_mac_f32_e32 v25, v37, v47 +v_mac_f32_e32 v26, v34, v48 +v_mac_f32_e32 v27, v35, v48 +v_mac_f32_e32 v28, v36, v48 +v_mac_f32_e32 v29, v37, v48 +v_mac_f32_e32 v30, v34, v49 +v_mac_f32_e32 v31, v35, v49 +v_subrev_f32_e64 v58, v60, v58 div:2 +v_subrev_f32_e64 v61, v59, v61 div:2 +v_add_f32_e64 v59, v60, v59 div:2 +v_mad_f32 v60, v60, 1.0, -v59 +s_setprio 0 +s_nop 0 +v_mac_f32_e32 v32, v36, v49 +v_mac_f32_e32 v33, v37, v49 +s_nop 0 +ds_read_b128 v[34:37], v95 offset:12928 +ds_read_b128 v[42:45], v90 offset:12416 +ds_read_b128 v[46:49], v90 offset:12544 +ds_write_b32 v91, v78 offset:33024 +ds_write_b32 v92, v79 offset:33024 +s_setprio 1 +s_add_u32 s40, s40, s70 +s_addc_u32 s41, s41, s71 +buffer_load_dword v74, v82, s[40:43], 0 offen +buffer_load_dword v76, v84, s[40:43], 0 offen +buffer_load_dword v75, v83, s[40:43], 0 offen +buffer_load_dword v77, v85, s[40:43], 0 offen +s_nop 0 +s_waitcnt lgkmcnt(5) +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 3 +s_call_b64 s[38:39], 978 +s_nop 0 +s_nop 0 +v_mac_f32_e32 v2, v38, v50 +v_mac_f32_e32 v3, v39, v50 +v_mac_f32_e32 v4, v40, v50 +v_mac_f32_e32 v5, v41, v50 +v_mac_f32_e32 v6, v38, v51 +v_mac_f32_e32 v7, v39, v51 +v_mac_f32_e32 v8, v40, v51 +v_mac_f32_e32 v9, v41, v51 +v_mac_f32_e32 v10, v38, v52 +v_mac_f32_e32 v11, v39, v52 +v_mac_f32_e32 v12, v40, v52 +v_mac_f32_e32 v13, v41, v52 +v_mac_f32_e32 v14, v38, v53 +v_mac_f32_e32 v15, v39, v53 +v_mac_f32_e32 v16, v40, v53 +v_mac_f32_e32 v17, v41, v53 +v_mac_f32_e32 v18, v38, v54 +v_mac_f32_e32 v19, v39, v54 +v_mac_f32_e32 v20, v40, v54 +v_mac_f32_e32 v21, v41, v54 +v_mac_f32_e32 v22, v38, v55 +v_mac_f32_e32 v23, v39, v55 +v_mac_f32_e32 v24, v40, v55 +v_mac_f32_e32 v25, v41, v55 +v_mac_f32_e32 v26, v38, v56 +v_mac_f32_e32 v27, v39, v56 +v_mac_f32_e32 v28, v40, v56 +v_mac_f32_e32 v29, v41, v56 +v_mac_f32_e32 v30, v38, v57 +v_mac_f32_e32 v31, v39, v57 +v_mac_f32_dpp v58, v58, v100 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_dpp v59, v59, v100 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_dpp v60, v60, v100 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_dpp v61, v61, v100 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +s_setprio 0 +s_nop 0 +v_mac_f32_e32 v32, v40, v57 +v_mac_f32_e32 v33, v41, v57 +s_nop 0 +ds_read_b128 v[38:41], v95 offset:17024 +ds_read_b128 v[50:53], v90 offset:16512 +ds_read_b128 v[54:57], v90 offset:16640 +ds_write_b32 v93, v60 offset:41280 +ds_write_b32 v94, v61 offset:41280 +s_setprio 1 +s_nop 0 +s_waitcnt vmcnt(12) lgkmcnt(5) +ds_append v105 offset:65480 +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 3 +s_call_b64 s[38:39], 914 +s_nop 0 +s_nop 0 +v_mac_f32_e32 v2, v34, v42 +v_mac_f32_e32 v3, v35, v42 +v_mac_f32_e32 v4, v36, v42 +v_mac_f32_e32 v5, v37, v42 +v_mac_f32_e32 v6, v34, v43 +v_mac_f32_e32 v7, v35, v43 +v_mac_f32_e32 v8, v36, v43 +v_mac_f32_e32 v9, v37, v43 +v_mac_f32_e32 v10, v34, v44 +v_mac_f32_e32 v11, v35, v44 +v_mac_f32_e32 v12, v36, v44 +v_mac_f32_e32 v13, v37, v44 +v_mac_f32_e32 v14, v34, v45 +v_mac_f32_e32 v15, v35, v45 +v_mac_f32_e32 v16, v36, v45 +v_mac_f32_e32 v17, v37, v45 +v_mac_f32_e32 v18, v34, v46 +v_mac_f32_e32 v19, v35, v46 +v_mac_f32_e32 v20, v36, v46 +v_mac_f32_e32 v21, v37, v46 +v_mac_f32_e32 v22, v34, v47 +v_mac_f32_e32 v23, v35, v47 +v_mac_f32_e32 v24, v36, v47 +v_mac_f32_e32 v25, v37, v47 +v_mac_f32_e32 v26, v34, v48 +v_mac_f32_e32 v27, v35, v48 +v_mac_f32_e32 v28, v36, v48 +v_mac_f32_e32 v29, v37, v48 +v_mac_f32_e32 v30, v34, v49 +v_mac_f32_e32 v31, v35, v49 +v_subrev_f32_e64 v62, v64, v62 div:2 +v_subrev_f32_e64 v65, v63, v65 div:2 +v_add_f32_e64 v63, v64, v63 div:2 +v_mad_f32 v64, v64, 1.0, -v63 +s_setprio 0 +s_nop 0 +v_mac_f32_e32 v32, v36, v49 +v_mac_f32_e32 v33, v37, v49 +s_nop 0 +ds_read_b128 v[34:37], v95 offset:21184 +ds_read_b128 v[42:45], v90 offset:20672 +ds_read_b128 v[46:49], v90 offset:20800 +ds_write_b32 v91, v58 offset:41280 +ds_write_b32 v92, v59 offset:41280 +s_setprio 1 +s_add_u32 s40, s40, s70 +s_addc_u32 s41, s41, s71 +buffer_load_dword v78, v82, s[40:43], 0 offen +buffer_load_dword v80, v84, s[40:43], 0 offen +buffer_load_dword v79, v83, s[40:43], 0 offen +buffer_load_dword v81, v85, s[40:43], 0 offen +s_mov_b32 m0, 0x2ffc8 +s_nop 0 +s_waitcnt lgkmcnt(5) +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 1 +s_call_b64 s[38:39], 840 +v_mac_f32_e32 v2, v38, v50 +v_mac_f32_e32 v3, v39, v50 +v_mac_f32_e32 v4, v40, v50 +v_mac_f32_e32 v5, v41, v50 +v_mac_f32_e32 v6, v38, v51 +v_mac_f32_e32 v7, v39, v51 +v_mac_f32_e32 v8, v40, v51 +v_mac_f32_e32 v9, v41, v51 +v_mac_f32_e32 v10, v38, v52 +v_mac_f32_e32 v11, v39, v52 +v_mac_f32_e32 v12, v40, v52 +v_mac_f32_e32 v13, v41, v52 +v_mac_f32_e32 v14, v38, v53 +v_mac_f32_e32 v15, v39, v53 +v_mac_f32_e32 v16, v40, v53 +v_mac_f32_e32 v17, v41, v53 +v_mac_f32_e32 v18, v38, v54 +v_mac_f32_e32 v19, v39, v54 +v_mac_f32_e32 v20, v40, v54 +v_mac_f32_e32 v21, v41, v54 +v_mac_f32_e32 v22, v38, v55 +v_mac_f32_e32 v23, v39, v55 +v_mac_f32_e32 v24, v40, v55 +v_mac_f32_e32 v25, v41, v55 +v_mac_f32_e32 v26, v38, v56 +v_mac_f32_e32 v27, v39, v56 +v_mac_f32_e32 v28, v40, v56 +v_mac_f32_e32 v29, v41, v56 +v_mac_f32_e32 v30, v38, v57 +v_mac_f32_e32 v31, v39, v57 +v_mac_f32_e32 v32, v40, v57 +v_mac_f32_dpp v62, v62, v100 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_dpp v63, v63, v100 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_dpp v64, v64, v100 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_dpp v65, v65, v100 quad_perm:[2,2,1,1] row_mask:0xf bank_mask:0xf +s_setprio 0 +s_nop 0 +v_mac_f32_e32 v33, v41, v57 +v_cmp_eq_u32_e64 vcc, src_lds_direct, s91 +s_nop 0 +s_nop 0 +s_cbranch_vccz 65531 +s_nop 0 +ds_read_b128 v[38:41], v95 offset:25280 +ds_read_b128 v[50:53], v90 offset:24768 +ds_read_b128 v[54:57], v90 offset:24896 +ds_write_b32 v93, v64 +ds_write_b32 v94, v65 +s_setprio 1 +s_nop 0 +s_waitcnt vmcnt(12) lgkmcnt(5) +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 64704 +s_call_b64 s[38:39], 775 +s_branch 64702 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +v_mac_f32_e32 v2, v34, v42 +v_mac_f32_e32 v3, v35, v42 +v_mac_f32_e32 v4, v36, v42 +s_setprio 1 +s_nop 0 +v_mac_f32_e32 v5, v37, v42 +v_mac_f32_e32 v6, v34, v43 +v_mac_f32_e32 v7, v35, v43 +v_mac_f32_e32 v8, v36, v43 +v_mac_f32_e32 v9, v37, v43 +v_mac_f32_e32 v10, v34, v44 +v_mac_f32_e32 v11, v35, v44 +v_mac_f32_e32 v12, v36, v44 +v_mac_f32_e32 v13, v37, v44 +v_mac_f32_e32 v14, v34, v45 +v_mac_f32_e32 v15, v35, v45 +v_mac_f32_e32 v16, v36, v45 +v_mac_f32_e32 v17, v37, v45 +v_mac_f32_e32 v18, v34, v46 +v_mac_f32_e32 v19, v35, v46 +v_mac_f32_e32 v20, v36, v46 +v_mac_f32_e32 v21, v37, v46 +v_mac_f32_e32 v22, v34, v47 +v_mac_f32_e32 v23, v35, v47 +v_mac_f32_e32 v24, v36, v47 +v_mac_f32_e32 v25, v37, v47 +v_mac_f32_e32 v26, v34, v48 +v_mac_f32_e32 v27, v35, v48 +v_mac_f32_e32 v28, v36, v48 +v_mac_f32_e32 v29, v37, v48 +v_mac_f32_e32 v30, v34, v49 +v_mac_f32_e32 v31, v35, v49 +v_mac_f32_dpp v66, v66, v100 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_dpp v69, v69, v100 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_e32 v32, v36, v49 +v_mac_f32_e32 v33, v37, v49 +s_nop 0 +ds_read_b128 v[34:37], v95 offset:29440 +ds_read_b128 v[42:45], v90 offset:28928 +ds_read_b128 v[46:49], v90 offset:29056 +ds_write_b32 v91, v62 +ds_write_b32 v92, v63 +s_setprio 0 +s_add_u32 s40, s40, s70 +s_addc_u32 s41, s41, s71 +buffer_load_dword v58, v82, s[40:43], 0 offen +buffer_load_dword v61, v85, s[40:43], 0 offen +s_add_u32 s91, s91, 0x200 +s_nop 0 +s_waitcnt lgkmcnt(5) +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 1 +s_call_b64 s[38:39], 704 +v_mac_f32_e32 v2, v38, v50 +v_mac_f32_e32 v3, v39, v50 +v_mac_f32_e32 v4, v40, v50 +s_setprio 1 +s_nop 0 +v_mac_f32_e32 v5, v41, v50 +v_mac_f32_e32 v6, v38, v51 +v_mac_f32_e32 v7, v39, v51 +v_mac_f32_e32 v8, v40, v51 +v_mac_f32_e32 v9, v41, v51 +v_mac_f32_e32 v10, v38, v52 +v_mac_f32_e32 v11, v39, v52 +v_mac_f32_e32 v12, v40, v52 +v_mac_f32_e32 v13, v41, v52 +v_mac_f32_e32 v14, v38, v53 +v_mac_f32_e32 v15, v39, v53 +v_mac_f32_e32 v16, v40, v53 +v_mac_f32_e32 v17, v41, v53 +v_mac_f32_e32 v18, v38, v54 +v_mac_f32_e32 v19, v39, v54 +v_mac_f32_e32 v20, v40, v54 +v_mac_f32_e32 v21, v41, v54 +v_mac_f32_e32 v22, v38, v55 +v_mac_f32_e32 v23, v39, v55 +v_mac_f32_e32 v24, v40, v55 +v_mac_f32_e32 v25, v41, v55 +v_mac_f32_e32 v26, v38, v56 +v_mac_f32_e32 v27, v39, v56 +v_mac_f32_e32 v28, v40, v56 +v_mac_f32_e32 v29, v41, v56 +v_mac_f32_e32 v30, v38, v57 +v_mac_f32_e32 v31, v39, v57 +v_add_f32_e64 v67, v66, v69 div:2 +v_add_f32_e64 v68, v66, -v69 div:2 +v_mac_f32_e32 v32, v40, v57 +v_mac_f32_e32 v33, v41, v57 +s_nop 0 +ds_read_b128 v[38:41], v95 offset:33536 +ds_read_b128 v[50:53], v90 offset:33024 +ds_read_b128 v[54:57], v90 offset:33152 +ds_write_b32 v93, v68 offset:8256 +ds_write_b32 v94, v69 offset:8256 +s_setprio 0 +s_nop 0 +s_waitcnt vmcnt(6) lgkmcnt(5) +ds_append v105 offset:65472 +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 7 +s_call_b64 s[38:39], 646 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +v_mac_f32_e32 v2, v34, v42 +v_mac_f32_e32 v3, v35, v42 +v_mac_f32_e32 v4, v36, v42 +s_setprio 1 +s_nop 0 +v_mac_f32_e32 v5, v37, v42 +v_mac_f32_e32 v6, v34, v43 +v_mac_f32_e32 v7, v35, v43 +v_mac_f32_e32 v8, v36, v43 +v_mac_f32_e32 v9, v37, v43 +v_mac_f32_e32 v10, v34, v44 +v_mac_f32_e32 v11, v35, v44 +v_mac_f32_e32 v12, v36, v44 +v_mac_f32_e32 v13, v37, v44 +v_mac_f32_e32 v14, v34, v45 +v_mac_f32_e32 v15, v35, v45 +v_mac_f32_e32 v16, v36, v45 +v_mac_f32_e32 v17, v37, v45 +v_mac_f32_e32 v18, v34, v46 +v_mac_f32_e32 v19, v35, v46 +v_mac_f32_e32 v20, v36, v46 +v_mac_f32_e32 v21, v37, v46 +v_mac_f32_e32 v22, v34, v47 +v_mac_f32_e32 v23, v35, v47 +v_mac_f32_e32 v24, v36, v47 +v_mac_f32_e32 v25, v37, v47 +v_mac_f32_e32 v26, v34, v48 +v_mac_f32_e32 v27, v35, v48 +v_mac_f32_e32 v28, v36, v48 +v_mac_f32_e32 v29, v37, v48 +v_mac_f32_e32 v30, v34, v49 +v_mac_f32_e32 v31, v35, v49 +v_mac_f32_dpp v70, v70, v100 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_dpp v73, v73, v100 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_e32 v32, v36, v49 +v_mac_f32_e32 v33, v37, v49 +s_nop 0 +ds_read_b128 v[34:37], v95 offset:37696 +ds_read_b128 v[42:45], v90 offset:37184 +ds_read_b128 v[46:49], v90 offset:37312 +ds_write_b32 v91, v66 offset:8256 +ds_write_b32 v92, v67 offset:8256 +s_setprio 0 +s_add_u32 s40, s40, s70 +s_addc_u32 s41, s41, s71 +buffer_load_dword v62, v82, s[40:43], 0 offen +buffer_load_dword v65, v85, s[40:43], 0 offen +s_mov_b32 m0, 0x2ffc0 +s_nop 0 +s_waitcnt lgkmcnt(5) +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 1 +s_call_b64 s[38:39], 576 +v_mac_f32_e32 v2, v38, v50 +v_mac_f32_e32 v3, v39, v50 +v_mac_f32_e32 v4, v40, v50 +s_setprio 1 +s_nop 0 +v_mac_f32_e32 v5, v41, v50 +v_mac_f32_e32 v6, v38, v51 +v_mac_f32_e32 v7, v39, v51 +v_mac_f32_e32 v8, v40, v51 +v_mac_f32_e32 v9, v41, v51 +v_mac_f32_e32 v10, v38, v52 +v_mac_f32_e32 v11, v39, v52 +v_mac_f32_e32 v12, v40, v52 +v_mac_f32_e32 v13, v41, v52 +v_mac_f32_e32 v14, v38, v53 +v_mac_f32_e32 v15, v39, v53 +v_mac_f32_e32 v16, v40, v53 +v_mac_f32_e32 v17, v41, v53 +v_mac_f32_e32 v18, v38, v54 +v_mac_f32_e32 v19, v39, v54 +v_mac_f32_e32 v20, v40, v54 +v_mac_f32_e32 v21, v41, v54 +v_mac_f32_e32 v22, v38, v55 +v_mac_f32_e32 v23, v39, v55 +v_mac_f32_e32 v24, v40, v55 +v_mac_f32_e32 v25, v41, v55 +v_mac_f32_e32 v26, v38, v56 +v_mac_f32_e32 v27, v39, v56 +v_mac_f32_e32 v28, v40, v56 +v_mac_f32_e32 v29, v41, v56 +v_mac_f32_e32 v30, v38, v57 +v_mac_f32_e32 v31, v39, v57 +v_mac_f32_e32 v32, v40, v57 +v_add_f32_e64 v71, v70, v73 div:2 +v_add_f32_e64 v72, v70, -v73 div:2 +v_mac_f32_e32 v33, v41, v57 +v_cmp_eq_u32_e64 vcc, src_lds_direct, s91 +s_nop 0 +s_nop 0 +s_cbranch_vccz 65531 +s_nop 0 +ds_read_b128 v[38:41], v95 offset:41792 +ds_read_b128 v[50:53], v90 offset:41280 +ds_read_b128 v[54:57], v90 offset:41408 +ds_write_b32 v93, v72 offset:16512 +ds_write_b32 v94, v73 offset:16512 +s_setprio 0 +s_nop 0 +s_waitcnt vmcnt(6) lgkmcnt(5) +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 4 +s_call_b64 s[38:39], 515 +s_nop 0 +s_nop 0 +s_nop 0 +v_mac_f32_e32 v2, v34, v42 +v_mac_f32_e32 v3, v35, v42 +v_mac_f32_e32 v4, v36, v42 +s_setprio 1 +s_nop 0 +v_mac_f32_e32 v5, v37, v42 +v_mac_f32_e32 v6, v34, v43 +v_mac_f32_e32 v7, v35, v43 +v_mac_f32_e32 v8, v36, v43 +v_mac_f32_e32 v9, v37, v43 +v_mac_f32_e32 v10, v34, v44 +v_mac_f32_e32 v11, v35, v44 +v_mac_f32_e32 v12, v36, v44 +v_mac_f32_e32 v13, v37, v44 +v_mac_f32_e32 v14, v34, v45 +v_mac_f32_e32 v15, v35, v45 +v_mac_f32_e32 v16, v36, v45 +v_mac_f32_e32 v17, v37, v45 +v_mac_f32_e32 v18, v34, v46 +v_mac_f32_e32 v19, v35, v46 +v_mac_f32_e32 v20, v36, v46 +v_mac_f32_e32 v21, v37, v46 +v_mac_f32_e32 v22, v34, v47 +v_mac_f32_e32 v23, v35, v47 +v_mac_f32_e32 v24, v36, v47 +v_mac_f32_e32 v25, v37, v47 +v_mac_f32_e32 v26, v34, v48 +v_mac_f32_e32 v27, v35, v48 +v_mac_f32_e32 v28, v36, v48 +v_mac_f32_e32 v29, v37, v48 +v_mac_f32_e32 v30, v34, v49 +v_mac_f32_e32 v31, v35, v49 +v_mac_f32_dpp v74, v74, v100 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_dpp v77, v77, v100 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_e32 v32, v36, v49 +v_mac_f32_e32 v33, v37, v49 +s_nop 0 +ds_read_b128 v[34:37], v95 offset:45952 +ds_read_b128 v[42:45], v90 offset:45440 +ds_read_b128 v[46:49], v90 offset:45568 +ds_write_b32 v91, v70 offset:16512 +ds_write_b32 v92, v71 offset:16512 +s_setprio 0 +s_add_u32 s40, s40, s70 +s_addc_u32 s41, s41, s71 +buffer_load_dword v66, v82, s[40:43], 0 offen +buffer_load_dword v69, v85, s[40:43], 0 offen +s_nop 0 +s_waitcnt lgkmcnt(5) +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 3 +s_call_b64 s[38:39], 450 +s_nop 0 +s_nop 0 +v_mac_f32_e32 v2, v38, v50 +v_mac_f32_e32 v3, v39, v50 +v_mac_f32_e32 v4, v40, v50 +s_setprio 1 +s_nop 0 +v_mac_f32_e32 v5, v41, v50 +v_mac_f32_e32 v6, v38, v51 +v_mac_f32_e32 v7, v39, v51 +v_mac_f32_e32 v8, v40, v51 +v_mac_f32_e32 v9, v41, v51 +v_mac_f32_e32 v10, v38, v52 +v_mac_f32_e32 v11, v39, v52 +v_mac_f32_e32 v12, v40, v52 +v_mac_f32_e32 v13, v41, v52 +v_mac_f32_e32 v14, v38, v53 +v_mac_f32_e32 v15, v39, v53 +v_mac_f32_e32 v16, v40, v53 +v_mac_f32_e32 v17, v41, v53 +v_mac_f32_e32 v18, v38, v54 +v_mac_f32_e32 v19, v39, v54 +v_mac_f32_e32 v20, v40, v54 +v_mac_f32_e32 v21, v41, v54 +v_mac_f32_e32 v22, v38, v55 +v_mac_f32_e32 v23, v39, v55 +v_mac_f32_e32 v24, v40, v55 +v_mac_f32_e32 v25, v41, v55 +v_mac_f32_e32 v26, v38, v56 +v_mac_f32_e32 v27, v39, v56 +v_mac_f32_e32 v28, v40, v56 +v_mac_f32_e32 v29, v41, v56 +v_mac_f32_e32 v30, v38, v57 +v_mac_f32_e32 v31, v39, v57 +v_add_f32_e64 v75, v74, v77 div:2 +v_add_f32_e64 v76, v74, -v77 div:2 +v_mac_f32_e32 v32, v40, v57 +v_mac_f32_e32 v33, v41, v57 +s_nop 0 +ds_read_b128 v[38:41], v95 offset:512 +ds_read_b128 v[50:53], v90 +ds_read_b128 v[54:57], v90 offset:128 +ds_write_b32 v93, v76 offset:24768 +ds_write_b32 v94, v77 offset:24768 +s_setprio 0 +s_nop 0 +s_waitcnt vmcnt(6) lgkmcnt(5) +ds_append v105 offset:65476 +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 7 +s_call_b64 s[38:39], 390 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +v_mac_f32_e32 v2, v34, v42 +v_mac_f32_e32 v3, v35, v42 +v_mac_f32_e32 v4, v36, v42 +s_setprio 1 +s_nop 0 +v_mac_f32_e32 v5, v37, v42 +v_mac_f32_e32 v6, v34, v43 +v_mac_f32_e32 v7, v35, v43 +v_mac_f32_e32 v8, v36, v43 +v_mac_f32_e32 v9, v37, v43 +v_mac_f32_e32 v10, v34, v44 +v_mac_f32_e32 v11, v35, v44 +v_mac_f32_e32 v12, v36, v44 +v_mac_f32_e32 v13, v37, v44 +v_mac_f32_e32 v14, v34, v45 +v_mac_f32_e32 v15, v35, v45 +v_mac_f32_e32 v16, v36, v45 +v_mac_f32_e32 v17, v37, v45 +v_mac_f32_e32 v18, v34, v46 +v_mac_f32_e32 v19, v35, v46 +v_mac_f32_e32 v20, v36, v46 +v_mac_f32_e32 v21, v37, v46 +v_mac_f32_e32 v22, v34, v47 +v_mac_f32_e32 v23, v35, v47 +v_mac_f32_e32 v24, v36, v47 +v_mac_f32_e32 v25, v37, v47 +v_mac_f32_e32 v26, v34, v48 +v_mac_f32_e32 v27, v35, v48 +v_mac_f32_e32 v28, v36, v48 +v_mac_f32_e32 v29, v37, v48 +v_mac_f32_e32 v30, v34, v49 +v_mac_f32_e32 v31, v35, v49 +v_mac_f32_dpp v78, v78, v100 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_dpp v81, v81, v100 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_e32 v32, v36, v49 +v_mac_f32_e32 v33, v37, v49 +s_nop 0 +ds_read_b128 v[34:37], v95 offset:4672 +ds_read_b128 v[42:45], v90 offset:4160 +ds_read_b128 v[46:49], v90 offset:4288 +ds_write_b32 v91, v74 offset:24768 +ds_write_b32 v92, v75 offset:24768 +s_setprio 0 +s_add_u32 s40, s40, s70 +s_addc_u32 s41, s41, s71 +buffer_load_dword v70, v82, s[40:43], 0 offen +buffer_load_dword v73, v85, s[40:43], 0 offen +s_mov_b32 m0, 0x2ffc4 +s_nop 0 +s_waitcnt lgkmcnt(5) +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 1 +s_call_b64 s[38:39], 320 +v_mac_f32_e32 v2, v38, v50 +v_mac_f32_e32 v3, v39, v50 +v_mac_f32_e32 v4, v40, v50 +s_setprio 1 +s_nop 0 +v_mac_f32_e32 v5, v41, v50 +v_mac_f32_e32 v6, v38, v51 +v_mac_f32_e32 v7, v39, v51 +v_mac_f32_e32 v8, v40, v51 +v_mac_f32_e32 v9, v41, v51 +v_mac_f32_e32 v10, v38, v52 +v_mac_f32_e32 v11, v39, v52 +v_mac_f32_e32 v12, v40, v52 +v_mac_f32_e32 v13, v41, v52 +v_mac_f32_e32 v14, v38, v53 +v_mac_f32_e32 v15, v39, v53 +v_mac_f32_e32 v16, v40, v53 +v_mac_f32_e32 v17, v41, v53 +v_mac_f32_e32 v18, v38, v54 +v_mac_f32_e32 v19, v39, v54 +v_mac_f32_e32 v20, v40, v54 +v_mac_f32_e32 v21, v41, v54 +v_mac_f32_e32 v22, v38, v55 +v_mac_f32_e32 v23, v39, v55 +v_mac_f32_e32 v24, v40, v55 +v_mac_f32_e32 v25, v41, v55 +v_mac_f32_e32 v26, v38, v56 +v_mac_f32_e32 v27, v39, v56 +v_mac_f32_e32 v28, v40, v56 +v_mac_f32_e32 v29, v41, v56 +v_mac_f32_e32 v30, v38, v57 +v_mac_f32_e32 v31, v39, v57 +v_mac_f32_e32 v32, v40, v57 +v_add_f32_e64 v79, v78, v81 div:2 +v_add_f32_e64 v80, v78, -v81 div:2 +v_mac_f32_e32 v33, v41, v57 +v_cmp_eq_u32_e64 vcc, src_lds_direct, s91 +s_nop 0 +s_nop 0 +s_cbranch_vccz 65531 +s_nop 0 +ds_read_b128 v[38:41], v95 offset:8768 +ds_read_b128 v[50:53], v90 offset:8256 +ds_read_b128 v[54:57], v90 offset:8384 +ds_write_b32 v93, v80 offset:33024 +ds_write_b32 v94, v81 offset:33024 +s_setprio 0 +s_nop 0 +s_waitcnt vmcnt(6) lgkmcnt(5) +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 4 +s_call_b64 s[38:39], 259 +s_nop 0 +s_nop 0 +s_nop 0 +v_mac_f32_e32 v2, v34, v42 +v_mac_f32_e32 v3, v35, v42 +v_mac_f32_e32 v4, v36, v42 +s_setprio 1 +s_nop 0 +v_mac_f32_e32 v5, v37, v42 +v_mac_f32_e32 v6, v34, v43 +v_mac_f32_e32 v7, v35, v43 +v_mac_f32_e32 v8, v36, v43 +v_mac_f32_e32 v9, v37, v43 +v_mac_f32_e32 v10, v34, v44 +v_mac_f32_e32 v11, v35, v44 +v_mac_f32_e32 v12, v36, v44 +v_mac_f32_e32 v13, v37, v44 +v_mac_f32_e32 v14, v34, v45 +v_mac_f32_e32 v15, v35, v45 +v_mac_f32_e32 v16, v36, v45 +v_mac_f32_e32 v17, v37, v45 +v_mac_f32_e32 v18, v34, v46 +v_mac_f32_e32 v19, v35, v46 +v_mac_f32_e32 v20, v36, v46 +v_mac_f32_e32 v21, v37, v46 +v_mac_f32_e32 v22, v34, v47 +v_mac_f32_e32 v23, v35, v47 +v_mac_f32_e32 v24, v36, v47 +v_mac_f32_e32 v25, v37, v47 +v_mac_f32_e32 v26, v34, v48 +v_mac_f32_e32 v27, v35, v48 +v_mac_f32_e32 v28, v36, v48 +v_mac_f32_e32 v29, v37, v48 +v_mac_f32_e32 v30, v34, v49 +v_mac_f32_e32 v31, v35, v49 +v_mac_f32_dpp v58, v58, v100 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_dpp v61, v61, v100 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_e32 v32, v36, v49 +v_mac_f32_e32 v33, v37, v49 +s_nop 0 +ds_read_b128 v[34:37], v95 offset:12928 +ds_read_b128 v[42:45], v90 offset:12416 +ds_read_b128 v[46:49], v90 offset:12544 +ds_write_b32 v91, v78 offset:33024 +ds_write_b32 v92, v79 offset:33024 +s_setprio 0 +s_add_u32 s40, s40, s70 +s_addc_u32 s41, s41, s71 +buffer_load_dword v74, v82, s[40:43], 0 offen +buffer_load_dword v77, v85, s[40:43], 0 offen +s_nop 0 +s_waitcnt lgkmcnt(5) +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 3 +s_call_b64 s[38:39], 194 +s_nop 0 +s_nop 0 +v_mac_f32_e32 v2, v38, v50 +v_mac_f32_e32 v3, v39, v50 +v_mac_f32_e32 v4, v40, v50 +s_setprio 1 +s_nop 0 +v_mac_f32_e32 v5, v41, v50 +v_mac_f32_e32 v6, v38, v51 +v_mac_f32_e32 v7, v39, v51 +v_mac_f32_e32 v8, v40, v51 +v_mac_f32_e32 v9, v41, v51 +v_mac_f32_e32 v10, v38, v52 +v_mac_f32_e32 v11, v39, v52 +v_mac_f32_e32 v12, v40, v52 +v_mac_f32_e32 v13, v41, v52 +v_mac_f32_e32 v14, v38, v53 +v_mac_f32_e32 v15, v39, v53 +v_mac_f32_e32 v16, v40, v53 +v_mac_f32_e32 v17, v41, v53 +v_mac_f32_e32 v18, v38, v54 +v_mac_f32_e32 v19, v39, v54 +v_mac_f32_e32 v20, v40, v54 +v_mac_f32_e32 v21, v41, v54 +v_mac_f32_e32 v22, v38, v55 +v_mac_f32_e32 v23, v39, v55 +v_mac_f32_e32 v24, v40, v55 +v_mac_f32_e32 v25, v41, v55 +v_mac_f32_e32 v26, v38, v56 +v_mac_f32_e32 v27, v39, v56 +v_mac_f32_e32 v28, v40, v56 +v_mac_f32_e32 v29, v41, v56 +v_mac_f32_e32 v30, v38, v57 +v_mac_f32_e32 v31, v39, v57 +v_add_f32_e64 v59, v58, v61 div:2 +v_add_f32_e64 v60, v58, -v61 div:2 +v_mac_f32_e32 v32, v40, v57 +v_mac_f32_e32 v33, v41, v57 +s_nop 0 +ds_read_b128 v[38:41], v95 offset:17024 +ds_read_b128 v[50:53], v90 offset:16512 +ds_read_b128 v[54:57], v90 offset:16640 +ds_write_b32 v93, v60 offset:41280 +ds_write_b32 v94, v61 offset:41280 +s_setprio 0 +s_nop 0 +s_waitcnt vmcnt(6) lgkmcnt(5) +ds_append v105 offset:65480 +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 7 +s_call_b64 s[38:39], 134 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +v_mac_f32_e32 v2, v34, v42 +v_mac_f32_e32 v3, v35, v42 +v_mac_f32_e32 v4, v36, v42 +s_setprio 1 +s_nop 0 +v_mac_f32_e32 v5, v37, v42 +v_mac_f32_e32 v6, v34, v43 +v_mac_f32_e32 v7, v35, v43 +v_mac_f32_e32 v8, v36, v43 +v_mac_f32_e32 v9, v37, v43 +v_mac_f32_e32 v10, v34, v44 +v_mac_f32_e32 v11, v35, v44 +v_mac_f32_e32 v12, v36, v44 +v_mac_f32_e32 v13, v37, v44 +v_mac_f32_e32 v14, v34, v45 +v_mac_f32_e32 v15, v35, v45 +v_mac_f32_e32 v16, v36, v45 +v_mac_f32_e32 v17, v37, v45 +v_mac_f32_e32 v18, v34, v46 +v_mac_f32_e32 v19, v35, v46 +v_mac_f32_e32 v20, v36, v46 +v_mac_f32_e32 v21, v37, v46 +v_mac_f32_e32 v22, v34, v47 +v_mac_f32_e32 v23, v35, v47 +v_mac_f32_e32 v24, v36, v47 +v_mac_f32_e32 v25, v37, v47 +v_mac_f32_e32 v26, v34, v48 +v_mac_f32_e32 v27, v35, v48 +v_mac_f32_e32 v28, v36, v48 +v_mac_f32_e32 v29, v37, v48 +v_mac_f32_e32 v30, v34, v49 +v_mac_f32_e32 v31, v35, v49 +v_mac_f32_dpp v62, v62, v100 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_dpp v65, v65, v100 quad_perm:[0,0,1,1] row_mask:0xf bank_mask:0xf +v_mac_f32_e32 v32, v36, v49 +v_mac_f32_e32 v33, v37, v49 +s_nop 0 +ds_read_b128 v[34:37], v95 offset:21184 +ds_read_b128 v[42:45], v90 offset:20672 +ds_read_b128 v[46:49], v90 offset:20800 +ds_write_b32 v91, v58 offset:41280 +ds_write_b32 v92, v59 offset:41280 +s_setprio 0 +s_add_u32 s40, s40, s70 +s_addc_u32 s41, s41, s71 +buffer_load_dword v78, v82, s[40:43], 0 offen +buffer_load_dword v81, v85, s[40:43], 0 offen +s_mov_b32 m0, 0x2ffc8 +s_nop 0 +s_waitcnt lgkmcnt(5) +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 1 +s_call_b64 s[38:39], 64 +v_mac_f32_e32 v2, v38, v50 +v_mac_f32_e32 v3, v39, v50 +v_mac_f32_e32 v4, v40, v50 +s_setprio 1 +s_nop 0 +v_mac_f32_e32 v5, v41, v50 +v_mac_f32_e32 v6, v38, v51 +v_mac_f32_e32 v7, v39, v51 +v_mac_f32_e32 v8, v40, v51 +v_mac_f32_e32 v9, v41, v51 +v_mac_f32_e32 v10, v38, v52 +v_mac_f32_e32 v11, v39, v52 +v_mac_f32_e32 v12, v40, v52 +v_mac_f32_e32 v13, v41, v52 +v_mac_f32_e32 v14, v38, v53 +v_mac_f32_e32 v15, v39, v53 +v_mac_f32_e32 v16, v40, v53 +v_mac_f32_e32 v17, v41, v53 +v_mac_f32_e32 v18, v38, v54 +v_mac_f32_e32 v19, v39, v54 +v_mac_f32_e32 v20, v40, v54 +v_mac_f32_e32 v21, v41, v54 +v_mac_f32_e32 v22, v38, v55 +v_mac_f32_e32 v23, v39, v55 +v_mac_f32_e32 v24, v40, v55 +v_mac_f32_e32 v25, v41, v55 +v_mac_f32_e32 v26, v38, v56 +v_mac_f32_e32 v27, v39, v56 +v_mac_f32_e32 v28, v40, v56 +v_mac_f32_e32 v29, v41, v56 +v_mac_f32_e32 v30, v38, v57 +v_mac_f32_e32 v31, v39, v57 +v_mac_f32_e32 v32, v40, v57 +v_add_f32_e64 v63, v62, v65 div:2 +v_add_f32_e64 v64, v62, -v65 div:2 +v_mac_f32_e32 v33, v41, v57 +v_cmp_eq_u32_e64 vcc, src_lds_direct, s91 +s_nop 0 +s_nop 0 +s_cbranch_vccz 65531 +s_nop 0 +ds_read_b128 v[38:41], v95 offset:25280 +ds_read_b128 v[50:53], v90 offset:24768 +ds_read_b128 v[54:57], v90 offset:24896 +ds_write_b32 v93, v64 +ds_write_b32 v94, v65 +s_setprio 0 +s_nop 0 +s_waitcnt vmcnt(6) lgkmcnt(5) +s_bitset0_b32 s18, 26 +s_add_u32 s72, s72, -1 +s_cbranch_scc1 64772 +s_call_b64 s[38:39], 3 +s_branch 64770 +s_nop 0 +s_nop 0 +v_nop +s_cmp_eq_u32 s82, 0 +s_cbranch_scc0 8 +s_branch 588 +s_add_u32 s82, s82, 1 +s_andn2_b32 s82, s82, 1 +s_bitcmp1_b32 0, 26 +s_cselect_b32 s52, s69, s70 +s_cselect_b32 s53, 0, s71 +s_sub_u32 s40, s40, s52 +s_subb_u32 s41, s41, s53 +s_cmp_eq_u32 s94, 0 +s_cbranch_scc0 3 +s_cbranch_scc1 610 +s_nop 0 +s_nop 0 +s_min_u32 s72, s82, s94 +s_sub_u32 s82, s82, s72 +s_sub_u32 s94, s94, s72 +s_sub_u32 s72, s72, 1 +s_setpc_b64 s[38:39] +s_nop 0 +s_nop 0 +s_nop 0 +s_bitcmp1_b32 s18, 17 +s_cbranch_scc1 241 +s_add_u32 s88, s88, s17 +s_cmp_eq_u32 s88, 0 +s_cbranch_scc1 238 +s_mov_b32 s89, 0 +s_bitcmp1_b32 s18, 16 +s_cbranch_scc1 227 +s_add_u32 s87, s16, 31 +s_lshr_b32 s87, s87, 5 +v_mov_b32_e32 v107, s88 +v_mul_u32_u24_e32 v107, s87, v107 +v_add_co_u32_e32 v107, vcc, s17, v107 +v_sub_co_u32_e64 v107, vcc, v107, 1 +v_ffbh_u32_e32 v110, s17 +v_lshlrev_b32_e64 v111, v110, s17 +v_and_b32_e32 v112, 0xffffff00, v111 +v_cmp_eq_u32_e32 vcc, 0x80000000, v111 +v_cvt_f32_u32_e32 v112, v112 +v_rcp_f32_e32 v106, v112 +v_subb_co_u32_e32 v109, vcc, 32, v110, vcc +v_cvt_f32_ubyte0_e32 v110, v111 +v_fma_f32 v112, v112, v106, -1.0 +v_fma_f32 v112, v110, v106, v112 +v_madak_f32 v112, v112, v106, 0x9f000000 +v_mul_f32_e32 v112, 0x5f800000, v112 +v_mov_b32_e32 v110, 0 +v_cvt_flr_i32_f32_e64 v112, -v112 +v_lshl_add_u32 v106, v106, 9, v112 +v_mad_u64_u32 v[110:111], vcc, v111, v106, v[110:111] +v_subb_co_u32_e64 v106, vcc, v106, -1, vcc +v_mul_hi_u32 v110, v107, v106 +v_add_co_u32_e32 v106, vcc, v110, v107 +v_addc_co_u32_e64 v110, vcc, 0, 0, vcc +v_cmp_eq_u32_e32 vcc, 32, v109 +v_cndmask_b32_e32 v106, v106, v110, vcc +v_alignbit_b32 v106, v110, v106, v109 +v_readfirstlane_b32 s86, v106 +v_mul_u32_u24_e64 v106, v106, s8 +v_ffbh_u32_e32 v110, s87 +v_lshlrev_b32_e64 v111, v110, s87 +v_and_b32_e32 v112, 0xffffff00, v111 +v_cmp_eq_u32_e32 vcc, 0x80000000, v111 +v_cvt_f32_u32_e32 v112, v112 +v_rcp_f32_e32 v107, v112 +v_subb_co_u32_e32 v109, vcc, 32, v110, vcc +v_cvt_f32_ubyte0_e32 v110, v111 +v_fma_f32 v112, v112, v107, -1.0 +v_fma_f32 v112, v110, v107, v112 +v_madak_f32 v112, v112, v107, 0x9f000000 +v_mul_f32_e32 v112, 0x5f800000, v112 +v_mov_b32_e32 v110, 0 +v_cvt_flr_i32_f32_e64 v112, -v112 +v_lshl_add_u32 v107, v107, 9, v112 +v_mad_u64_u32 v[110:111], vcc, v111, v107, v[110:111] +v_subb_co_u32_e64 v107, vcc, v107, -1, vcc +v_mul_hi_u32 v110, v106, v107 +v_add_co_u32_e32 v107, vcc, v110, v106 +v_addc_co_u32_e64 v110, vcc, 0, 0, vcc +v_cmp_eq_u32_e32 vcc, 32, v109 +v_cndmask_b32_e32 v107, v107, v110, vcc +v_alignbit_b32 v107, v110, v107, v109 +v_readfirstlane_b32 s52, v106 +v_readfirstlane_b32 s84, v107 +s_mul_i32 s84, s84, s87 +s_sub_u32 s84, s52, s84 +v_sub_co_u32_e32 v107, vcc, s8, v107 +v_sub_co_u32_e32 v107, vcc, s17, v107 +v_and_b32_e64 v109, v0, 63 +v_cmp_eq_u32_e64 vcc, v109, 0 +v_cndmask_b32_e32 v107, 1, v107, vcc +s_sub_u32 s58, 0, s75 +s_sub_u32 s59, 0, s74 +v_mul_u32_u24_e64 v111, v107, 32 +v_ffbh_u32_e32 v113, s58 +v_lshlrev_b32_e64 v114, v113, s58 +v_and_b32_e32 v115, 0xffffff00, v114 +v_cmp_eq_u32_e32 vcc, 0x80000000, v114 +v_cvt_f32_u32_e32 v115, v115 +v_rcp_f32_e32 v109, v115 +v_subb_co_u32_e32 v112, vcc, 32, v113, vcc +v_cvt_f32_ubyte0_e32 v113, v114 +v_fma_f32 v115, v115, v109, -1.0 +v_fma_f32 v115, v113, v109, v115 +v_madak_f32 v115, v115, v109, 0x9f000000 +v_mul_f32_e32 v115, 0x5f800000, v115 +v_mov_b32_e32 v113, 0 +v_cvt_flr_i32_f32_e64 v115, -v115 +v_lshl_add_u32 v109, v109, 9, v115 +v_mad_u64_u32 v[113:114], vcc, v114, v109, v[113:114] +v_subb_co_u32_e64 v109, vcc, v109, -1, vcc +v_mul_hi_u32 v113, v111, v109 +v_add_co_u32_e32 v109, vcc, v113, v111 +v_addc_co_u32_e64 v113, vcc, 0, 0, vcc +v_cmp_eq_u32_e32 vcc, 32, v112 +v_cndmask_b32_e32 v109, v109, v113, vcc +v_alignbit_b32 v109, v113, v109, v112 +v_mad_i32_i24 v110, v109, s75, v111 +v_mul_u32_u24_e64 v111, v109, 1 +v_ffbh_u32_e32 v113, s59 +v_lshlrev_b32_e64 v114, v113, s59 +v_and_b32_e32 v115, 0xffffff00, v114 +v_cmp_eq_u32_e32 vcc, 0x80000000, v114 +v_cvt_f32_u32_e32 v115, v115 +v_rcp_f32_e32 v109, v115 +v_subb_co_u32_e32 v112, vcc, 32, v113, vcc +v_cvt_f32_ubyte0_e32 v113, v114 +v_fma_f32 v115, v115, v109, -1.0 +v_fma_f32 v115, v113, v109, v115 +v_madak_f32 v115, v115, v109, 0x9f000000 +v_mul_f32_e32 v115, 0x5f800000, v115 +v_mov_b32_e32 v113, 0 +v_cvt_flr_i32_f32_e64 v115, -v115 +v_lshl_add_u32 v109, v109, 9, v115 +v_mad_u64_u32 v[113:114], vcc, v114, v109, v[113:114] +v_subb_co_u32_e64 v109, vcc, v109, -1, vcc +v_mul_hi_u32 v113, v111, v109 +v_add_co_u32_e32 v109, vcc, v113, v111 +v_addc_co_u32_e64 v113, vcc, 0, 0, vcc +v_cmp_eq_u32_e32 vcc, 32, v112 +v_cndmask_b32_e32 v109, v109, v113, vcc +v_alignbit_b32 v109, v113, v109, v112 +v_mad_i32_i24 v111, v109, s74, v111 +v_readfirstlane_b32 s76, v110 +v_readfirstlane_b32 s77, v111 +v_readfirstlane_b32 s78, v109 +v_add_co_u32_e32 v96, vcc, s76, v96 +v_addc_co_u32_e64 v112, vcc, 0, 0, vcc +v_mad_i32_i24 v96, v112, s75, v96 +v_mad_i32_i24 v98, v112, s80, v98 +v_mad_i32_i24 v97, v112, s79, v97 +v_cmp_ge_i32_e64 vcc, v97, 0 +v_addc_co_u32_e64 v112, vcc, 0, 0, vcc +v_add_co_u32_e32 v98, vcc, v98, v112 +v_mad_i32_i24 v97, v112, s74, v97 +v_add_co_u32_e32 v97, vcc, s77, v97 +v_addc_co_u32_e64 v112, vcc, 0, 0, vcc +v_add_co_u32_e32 v98, vcc, v98, v112 +v_mad_i32_i24 v97, v112, s74, v97 +v_add_co_u32_e32 v98, vcc, s78, v98 +v_readlane_b32 s76, v110, 1 +v_readlane_b32 s77, v111, 1 +v_readlane_b32 s78, v109, 1 +s_add_u32 s85, s84, s86 +s_cmp_le_u32 s85, s87 +s_cselect_b32 s52, 0x20000, 0 +s_cselect_b32 s85, s85, s87 +s_or_b32 s18, s18, s52 +s_lshl_b32 s84, s84, 5 +s_lshl_b32 s85, s85, 5 +s_min_u32 s85, s85, s16 +s_cmp_eq_u32 s8, s17 +s_cselect_b32 s52, 0x20000, 0 +s_or_b32 s18, s18, s52 +s_or_b32 s18, s18, s52 +s_bitset1_b32 s18, 16 +s_branch 43 +s_lshr_b32 s84, s84, 5 +s_add_u32 s85, s84, s86 +s_sub_u32 s85, s85, s87 +s_mov_b32 s84, 0 +s_lshl_b32 s85, s85, 5 +s_min_u32 s85, s85, s16 +s_bitset1_b32 s18, 17 +s_branch 12 +s_bitset1_b32 s18, 18 +s_mov_b32 s43, 0 +s_mov_b32 s73, -1 +s_mov_b32 s82, 40 +s_branch 31 +s_add_u32 s83, s83, 32 +s_cmp_ge_u32 s83, s85 +s_cbranch_scc0 28 +s_bitset1_b32 s18, 22 +s_sub_u32 s88, s88, s17 +s_subb_u32 s89, s89, 0 +s_cbranch_scc1 65281 +v_add_co_u32_e32 v96, vcc, s76, v96 +v_addc_co_u32_e64 v106, vcc, 0, 0, vcc +v_mad_i32_i24 v96, v106, s75, v96 +v_mad_i32_i24 v98, v106, s80, v98 +v_mad_i32_i24 v97, v106, s79, v97 +v_cmp_ge_i32_e64 vcc, v97, 0 +v_addc_co_u32_e64 v106, vcc, 0, 0, vcc +v_add_co_u32_e32 v98, vcc, v98, v106 +v_mad_i32_i24 v97, v106, s74, v97 +v_add_co_u32_e32 v97, vcc, s77, v97 +v_addc_co_u32_e64 v106, vcc, 0, 0, vcc +v_add_co_u32_e32 v98, vcc, v98, v106 +v_mad_i32_i24 v97, v106, s74, v97 +v_add_co_u32_e32 v98, vcc, s78, v98 +s_mov_b32 s83, s84 +v_cmp_le_u32_e32 vcc, 0x100, v0 +s_cbranch_vccz 166 +v_subrev_co_u32_e32 v106, vcc, s75, v96 +v_subrev_co_u32_e32 v107, vcc, s74, v97 +s_bitcmp1_b32 s18, 22 +s_cbranch_scc0 64 +s_bitset0_b32 s18, 22 +s_bfe_u32 s52, s18, 0x10014 +v_mul_u32_u24_e32 v111, 3, v106 +v_mul_u32_u24_e32 v112, 3, v107 +v_cvt_pk_u16_u32 v114, v111, v112 +v_and_b32_e64 v111, v0, 1 +v_cmp_eq_u32_e64 vcc, v111, 1 +v_cndmask_b32_e32 v114, v98, v114, vcc +v_lshrrev_b32_e32 v110, 1, v0 +v_bfe_u32 v115, v110, s52, 1 +v_lshrrev_b32_e32 v110, 1, v0 +v_bfi_b32 v110, 1, v0, v110 +v_lshrrev_b32_e32 v111, 2, v0 +v_bfi_b32 v111, 1, v0, v111 +v_cmp_eq_u32_e64 vcc, s52, 0 +v_cndmask_b32_e32 v110, v111, v110, vcc +s_sub_u32 s52, 1, s52 +v_lshrrev_b32_e32 v111, s52, v110 +v_bfi_b32 v110, 32, v111, v110 +v_and_b32_e32 v110, 63, v110 +v_add_co_u32_e32 v111, vcc, 16, v110 +v_and_b32_e64 v112, v0, 2 +v_cmp_eq_u32_e64 vcc, v112, 0 +v_cndmask_b32_e32 v111, v111, v110, vcc +v_lshlrev_b32_e32 v112, 14, v115 +v_mad_u32_u24 v111, 4, v111, v112 +v_add_co_u32_e32 v110, vcc, s96, v111 +ds_write_b32 v110, v114 +v_writelane_b32 v112, s18, 0 +v_writelane_b32 v112, s85, 1 +v_writelane_b32 v112, s84, 2 +v_and_b32_e64 v110, v0, 63 +v_cmp_ge_u32_e64 vcc, v110, 3 +v_mov_b32_e32 v113, 0x4000 +v_cndmask_b32_e32 v110, v110, v113, vcc +v_mad_i32_i24 v110, v110, 4, s96 +ds_write_b32 v110, v112 offset:256 +s_add_u32 s96, s96, 0x18c +s_cmp_eq_u32 s96, 0xffc0 +s_cselect_b32 s96, 0xc1e0, s96 +v_mov_b32_dpp v108, v98 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf +v_mov_b32_dpp v106, v106 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf +v_mov_b32_dpp v107, v107 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf +v_readfirstlane_b32 s81, v108 +v_sub_co_u32_e64 v109, vcc, v108, s81 +v_mul_lo_u32 v109, v109, s65 +v_and_b32_e64 v113, v0, 3 +v_ashrrev_i32_e64 v114, 0, s31 +v_subrev_co_u32_e32 v113, vcc, v114, v113 +v_ashrrev_i32_e64 v114, 0, s62 +v_mad_i32_i24 v110, v114, 2, v113 +s_bfe_u32 s52, s18, 0x10014 +v_lshrrev_b32_e32 v112, 2, v0 +v_and_b32_e32 v112, s52, v112 +v_mad_i32_i24 v110, v112, 2, v110 +v_add_co_u32_e64 v111, vcc, 0, s63 +v_ashrrev_i32_e32 v111, 0, v111 +v_add_co_u32_e64 v112, vcc, 0, s30 +v_ashrrev_i32_e32 v112, 0, v112 +v_sub_i32 v111, v111, v112 +s_lshl_b32 s54, s15, 2 +v_cmp_ge_u32_e64 s[52:53], v108, s12 +v_mad_i32_i24 v106, v106, 3, v110 +v_cmp_ge_u32_e64 s[56:57], v106, s15 +v_mad_i32_i24 v106, 4, v106, v109 +s_or_b64 s[56:57], s[56:57], s[52:53] +v_mad_i32_i24 v107, v107, 3, v111 +v_cmp_ge_u32_e64 s[58:59], v107, s14 +s_or_b64 s[58:59], s[56:57], s[58:59] +v_mad_u32_u24 v82, v107, s54, v106 +v_cndmask_b32_e64 v82, v82, -1, s[58:59] +v_add_co_u32_e32 v107, vcc, 1, v107 +v_cmp_ge_u32_e64 s[58:59], v107, s14 +s_or_b64 s[58:59], s[56:57], s[58:59] +v_mad_u32_u24 v83, v107, s54, v106 +v_cndmask_b32_e64 v83, v83, -1, s[58:59] +v_add_co_u32_e32 v107, vcc, 1, v107 +v_cmp_ge_u32_e64 s[58:59], v107, s14 +s_or_b64 s[58:59], s[56:57], s[58:59] +v_mad_u32_u24 v84, v107, s54, v106 +v_cndmask_b32_e64 v84, v84, -1, s[58:59] +v_add_co_u32_e32 v107, vcc, 1, v107 +v_cmp_ge_u32_e64 s[58:59], v107, s14 +s_or_b64 s[58:59], s[56:57], s[58:59] +v_mad_u32_u24 v85, v107, s54, v106 +v_cndmask_b32_e64 v85, v85, -1, s[58:59] +s_bitcmp1_b32 s18, 18 +s_cbranch_scc1 138 +s_lshr_b32 s52, -1, 16 +s_and_b32 s52, s52, s65 +s_lshr_b32 s53, s65, 16 +s_mul_i32 s53, s53, s81 +s_mul_i32 s40, s52, s81 +s_lshl_b32 s52, s53, 16 +s_lshr_b32 s53, s53, 16 +s_add_u32 s40, s52, s40 +s_addc_u32 s41, s53, 0 +s_add_u32 s40, s40, s20 +s_addc_u32 s41, s41, s21 +s_lshr_b32 s52, s18, 6 +s_xor_b32 s52, s52, s18 +s_and_b32 s52, s52, 0x80000 +s_cselect_b32 s52, s68, 0 +s_add_u32 s40, s40, s52 +s_addc_u32 s41, s41, 0 +s_branch 95 +s_bitcmp1_b32 s18, 18 +s_cbranch_scc1 117 +s_bfe_u32 s52, s18, 0x10014 +v_xor_b32_dpp v106, v0, v0 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0xf +v_bfe_u32 v108, v0, 2, s52 +v_mad_u32_u24 v106, v108, 2, v106 +v_mad_u32_u24 v106, s62, 2, v106 +v_sub_co_u32_e32 v108, vcc, s29, v106 +v_sub_co_u32_e64 v108, vcc, v108, 1 +s_bfe_u32 s54, s18, 0x10001 +v_cmp_eq_u32_e64 vcc, s54, 1 +v_cndmask_b32_e32 v106, v106, v108, vcc +v_cmp_ge_u32_e64 s[52:53], v106, s29 +v_lshlrev_b32_e32 v106, 2, v106 +s_bfe_u32 s54, s18, 0x10018 +v_bfe_u32 v109, v0, 2, s54 +v_mul_lo_u32 v109, s68, v109 +v_add_co_u32_e32 v106, vcc, v106, v109 +v_mul_lo_u32 v107, s90, v99 +v_add_co_u32_e32 v107, vcc, v107, v106 +s_sub_u32 s54, s28, s63 +s_sub_u32 s54, s54, 2 +s_bitcmp1_b32 s18, 0 +s_cselect_b32 s54, s54, s63 +v_mov_b32_e32 v109, s54 +s_lshl_b32 s57, s29, 2 +v_cmp_ge_u32_e64 s[54:55], v109, s28 +v_mad_i32_i24 v82, v109, s57, v107 +s_or_b64 s[54:55], s[54:55], s[52:53] +v_cndmask_b32_e64 v82, v82, -1, s[54:55] +v_mov_b32_e32 v83, v82 +v_add_co_u32_e64 v109, vcc, v109, 1 +v_cmp_ge_u32_e64 s[54:55], v109, s28 +v_mad_i32_i24 v85, v109, s57, v107 +s_or_b64 s[54:55], s[54:55], s[52:53] +v_cndmask_b32_e64 v85, v85, -1, s[54:55] +v_add_co_u32_e64 v109, vcc, v109, 1 +v_cmp_ge_u32_e64 s[54:55], v109, s28 +v_mad_i32_i24 v84, v109, s57, v107 +s_or_b64 s[54:55], s[54:55], s[52:53] +v_cndmask_b32_e64 v84, v84, -1, s[54:55] +s_bitcmp1_b32 s18, 0 +s_cselect_b64 vcc, -1, 0 +v_cndmask_b32_e32 v82, v83, v85, vcc +v_cndmask_b32_e32 v85, v85, v83, vcc +v_add_co_u32_e64 v106, vcc, v99, s83 +v_cmp_lt_u32_e64 vcc, v106, s16 +v_cndmask_b32_e32 v82, -1, v82, vcc +v_cndmask_b32_e32 v83, -1, v83, vcc +v_cndmask_b32_e32 v84, -1, v84, vcc +v_cndmask_b32_e32 v85, -1, v85, vcc +s_lshr_b32 s52, -1, 16 +s_and_b32 s52, s52, s90 +s_lshr_b32 s53, s90, 16 +s_mul_i32 s53, s53, s83 +s_mul_i32 s40, s52, s83 +s_lshl_b32 s52, s53, 16 +s_lshr_b32 s53, s53, 16 +s_add_u32 s40, s52, s40 +s_addc_u32 s41, s53, 0 +s_add_u32 s40, s40, s22 +s_addc_u32 s41, s41, s23 +s_lshr_b32 s52, s18, 6 +s_xor_b32 s52, s52, s18 +s_and_b32 s52, s52, 0x80000 +s_cselect_b32 s52, s68, 0 +s_add_u32 s40, s40, s52 +s_addc_u32 s41, s41, 0 +s_mov_b32 s43, 0x20000 +s_mov_b32 s73, -1 +s_bfe_u32 s52, s18, 0x10014 +s_lshl_b32 s82, s13, s52 +s_bfe_u32 s52, s18, 0x10013 +s_bfe_u32 s54, s18, 0x10019 +s_xor_b32 s52, s52, s54 +s_cselect_b32 s52, 1, 0 +s_cselect_b32 s43, 0x20000, s43 +s_and_b32 s52, s52, s82 +s_sub_u32 s82, s82, s52 +s_bitcmp1_b32 s18, 20 +s_cselect_b32 s52, 0, 0x2000000 +s_bitcmp1_b32 s13, 0 +s_cselect_b32 s52, s52, 0 +s_xor_b32 s18, s18, s52 +s_cmp_eq_u32 s82, 0 +s_cbranch_scc1 1 +s_branch 64948 +s_and_b32 s52, 0x900000, s18 +s_subb_u32 s62, s62, 1 +s_cbranch_scc0 65243 +s_and_b32 s52, 0x900000, s18 +s_subb_u32 s62, s61, 1 +s_add_u32 s63, s63, 2 +s_cmp_ge_u32 s63, s28 +s_cbranch_scc0 65237 +s_mov_b32 s63, 0 +s_branch 65204 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_mov_b32 s52, 0x3c3c3c3c +s_mov_b32 s53, s52 +v_mov_b32_e32 v107, v3 +v_mov_b32_e32 v108, v4 +v_mov_b32_e32 v109, v5 +s_waitcnt lgkmcnt(0) +v_mov_b32_e32 v106, v2 +v_add_f32_dpp v106, v2, v2 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v107, v3, v3 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v108, v4, v4 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v109, v5, v5 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v4, v4, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v5, v5, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v2, v2, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v3, v3, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mov_b32_dpp v3, v4 row_mirror row_mask:0xf bank_mask:0x3 +v_mov_b32_dpp v2, v5 row_mirror row_mask:0xf bank_mask:0x3 +v_add_f32_dpp v107, v108, v107 row_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v106, v109, v106 row_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v109, v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v3, v3, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_sub_f32_dpp v108, v2, v2 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v2, v2, v2 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_e32 v5, v107 +v_add_f32_dpp v5, v107, v107 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v108, v109 row_ror:12 row_mask:0xf bank_mask:0x1 +v_mov_b32_dpp v108, v109 row_ror:4 row_mask:0xf bank_mask:0x8 +v_mov_b32_e32 v4, v106 +v_add_f32_dpp v4, v106, v106 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v108, v108 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0x3 +v_sub_f32_dpp v109, v107, v107 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v3, v3, v2 row_half_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v2, v5, v4 row_half_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v4, v106, v106 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v4, v109 row_half_mirror row_mask:0xf bank_mask:0x9 +v_mov_b32_dpp v108, v108 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xc +v_cndmask_b32_e64 v3, v108, v3, s[52:53] +v_mov_b32_dpp v4, v4 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0x5 +s_nop 1 +v_mov_b32_dpp v4, v4 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +v_mov_b32_e32 v107, v7 +v_mov_b32_e32 v108, v8 +v_mov_b32_e32 v109, v9 +s_waitcnt lgkmcnt(0) +v_mov_b32_e32 v106, v6 +v_add_f32_dpp v106, v6, v6 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v107, v7, v7 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v108, v8, v8 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v109, v9, v9 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v8, v8, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v9, v9, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v6, v6, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v7, v7, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mov_b32_dpp v7, v8 row_mirror row_mask:0xf bank_mask:0x3 +v_mov_b32_dpp v6, v9 row_mirror row_mask:0xf bank_mask:0x3 +v_add_f32_dpp v107, v108, v107 row_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v106, v109, v106 row_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v109, v7, v7 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v7, v7, v7 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_sub_f32_dpp v108, v6, v6 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v6, v6, v6 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_e32 v9, v107 +v_add_f32_dpp v9, v107, v107 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v108, v109 row_ror:12 row_mask:0xf bank_mask:0x1 +v_mov_b32_dpp v108, v109 row_ror:4 row_mask:0xf bank_mask:0x8 +v_mov_b32_e32 v8, v106 +v_add_f32_dpp v8, v106, v106 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v108, v108 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0x3 +v_sub_f32_dpp v109, v107, v107 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v7, v7, v6 row_half_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v5, v9, v8 row_half_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v8, v106, v106 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v8, v109 row_half_mirror row_mask:0xf bank_mask:0x9 +v_mov_b32_dpp v108, v108 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xc +v_cndmask_b32_e64 v6, v108, v7, s[52:53] +v_mov_b32_dpp v7, v8 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0x5 +s_nop 1 +v_mov_b32_dpp v7, v8 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +v_mov_b32_e32 v107, v11 +v_mov_b32_e32 v108, v12 +v_mov_b32_e32 v109, v13 +s_waitcnt lgkmcnt(0) +v_mov_b32_e32 v106, v10 +v_add_f32_dpp v106, v10, v10 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v107, v11, v11 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v108, v12, v12 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v109, v13, v13 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v12, v12, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v13, v13, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v10, v10, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v11, v11, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mov_b32_dpp v11, v12 row_mirror row_mask:0xf bank_mask:0x3 +v_mov_b32_dpp v10, v13 row_mirror row_mask:0xf bank_mask:0x3 +v_add_f32_dpp v107, v108, v107 row_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v106, v109, v106 row_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v109, v11, v11 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v11, v11, v11 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_sub_f32_dpp v108, v10, v10 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v10, v10, v10 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_e32 v13, v107 +v_add_f32_dpp v13, v107, v107 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v108, v109 row_ror:12 row_mask:0xf bank_mask:0x1 +v_mov_b32_dpp v108, v109 row_ror:4 row_mask:0xf bank_mask:0x8 +v_mov_b32_e32 v12, v106 +v_add_f32_dpp v12, v106, v106 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v108, v108 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0x3 +v_sub_f32_dpp v109, v107, v107 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v11, v11, v10 row_half_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v8, v13, v12 row_half_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v12, v106, v106 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v12, v109 row_half_mirror row_mask:0xf bank_mask:0x9 +v_mov_b32_dpp v108, v108 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xc +v_cndmask_b32_e64 v9, v108, v11, s[52:53] +v_mov_b32_dpp v10, v12 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0x5 +s_nop 1 +v_mov_b32_dpp v10, v12 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +v_mov_b32_e32 v107, v15 +v_mov_b32_e32 v108, v16 +v_mov_b32_e32 v109, v17 +s_waitcnt lgkmcnt(0) +v_mov_b32_e32 v106, v14 +v_add_f32_dpp v106, v14, v14 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v107, v15, v15 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v108, v16, v16 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v109, v17, v17 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v16, v16, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v17, v17, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v14, v14, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v15, v15, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mov_b32_dpp v15, v16 row_mirror row_mask:0xf bank_mask:0x3 +v_mov_b32_dpp v14, v17 row_mirror row_mask:0xf bank_mask:0x3 +v_add_f32_dpp v107, v108, v107 row_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v106, v109, v106 row_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v109, v15, v15 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v15, v15, v15 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_sub_f32_dpp v108, v14, v14 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v14, v14, v14 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_e32 v17, v107 +v_add_f32_dpp v17, v107, v107 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v108, v109 row_ror:12 row_mask:0xf bank_mask:0x1 +v_mov_b32_dpp v108, v109 row_ror:4 row_mask:0xf bank_mask:0x8 +v_mov_b32_e32 v16, v106 +v_add_f32_dpp v16, v106, v106 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v108, v108 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0x3 +v_sub_f32_dpp v109, v107, v107 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v15, v15, v14 row_half_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v11, v17, v16 row_half_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v16, v106, v106 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v16, v109 row_half_mirror row_mask:0xf bank_mask:0x9 +v_mov_b32_dpp v108, v108 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xc +v_cndmask_b32_e64 v12, v108, v15, s[52:53] +v_mov_b32_dpp v13, v16 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0x5 +s_nop 1 +v_mov_b32_dpp v13, v16 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +v_mov_b32_e32 v107, v19 +v_mov_b32_e32 v108, v20 +v_mov_b32_e32 v109, v21 +s_waitcnt lgkmcnt(0) +v_mov_b32_e32 v106, v18 +v_add_f32_dpp v106, v18, v18 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v107, v19, v19 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v108, v20, v20 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v109, v21, v21 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v20, v20, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v21, v21, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v18, v18, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v19, v19, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mov_b32_dpp v19, v20 row_mirror row_mask:0xf bank_mask:0x3 +v_mov_b32_dpp v18, v21 row_mirror row_mask:0xf bank_mask:0x3 +v_add_f32_dpp v107, v108, v107 row_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v106, v109, v106 row_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v109, v19, v19 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v19, v19, v19 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_sub_f32_dpp v108, v18, v18 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v18, v18, v18 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_e32 v21, v107 +v_add_f32_dpp v21, v107, v107 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v108, v109 row_ror:12 row_mask:0xf bank_mask:0x1 +v_mov_b32_dpp v108, v109 row_ror:4 row_mask:0xf bank_mask:0x8 +v_mov_b32_e32 v20, v106 +v_add_f32_dpp v20, v106, v106 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v108, v108 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0x3 +v_sub_f32_dpp v109, v107, v107 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v19, v19, v18 row_half_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v14, v21, v20 row_half_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v20, v106, v106 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v20, v109 row_half_mirror row_mask:0xf bank_mask:0x9 +v_mov_b32_dpp v108, v108 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xc +v_cndmask_b32_e64 v15, v108, v19, s[52:53] +v_mov_b32_dpp v16, v20 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0x5 +s_nop 1 +v_mov_b32_dpp v16, v20 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +v_mov_b32_e32 v107, v23 +v_mov_b32_e32 v108, v24 +v_mov_b32_e32 v109, v25 +s_waitcnt lgkmcnt(0) +v_mov_b32_e32 v106, v22 +v_add_f32_dpp v106, v22, v22 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v107, v23, v23 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v108, v24, v24 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v109, v25, v25 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v24, v24, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v25, v25, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v22, v22, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v23, v23, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mov_b32_dpp v23, v24 row_mirror row_mask:0xf bank_mask:0x3 +v_mov_b32_dpp v22, v25 row_mirror row_mask:0xf bank_mask:0x3 +v_add_f32_dpp v107, v108, v107 row_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v106, v109, v106 row_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v109, v23, v23 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v23, v23, v23 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_sub_f32_dpp v108, v22, v22 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v22, v22, v22 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_e32 v25, v107 +v_add_f32_dpp v25, v107, v107 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v108, v109 row_ror:12 row_mask:0xf bank_mask:0x1 +v_mov_b32_dpp v108, v109 row_ror:4 row_mask:0xf bank_mask:0x8 +v_mov_b32_e32 v24, v106 +v_add_f32_dpp v24, v106, v106 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v108, v108 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0x3 +v_sub_f32_dpp v109, v107, v107 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v23, v23, v22 row_half_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v17, v25, v24 row_half_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v24, v106, v106 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v24, v109 row_half_mirror row_mask:0xf bank_mask:0x9 +v_mov_b32_dpp v108, v108 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xc +v_cndmask_b32_e64 v18, v108, v23, s[52:53] +v_mov_b32_dpp v19, v24 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0x5 +s_nop 1 +v_mov_b32_dpp v19, v24 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +v_mov_b32_e32 v107, v27 +v_mov_b32_e32 v108, v28 +v_mov_b32_e32 v109, v29 +s_waitcnt lgkmcnt(0) +v_mov_b32_e32 v106, v26 +v_add_f32_dpp v106, v26, v26 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v107, v27, v27 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v108, v28, v28 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v109, v29, v29 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v28, v28, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v29, v29, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v26, v26, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v27, v27, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mov_b32_dpp v27, v28 row_mirror row_mask:0xf bank_mask:0x3 +v_mov_b32_dpp v26, v29 row_mirror row_mask:0xf bank_mask:0x3 +v_add_f32_dpp v107, v108, v107 row_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v106, v109, v106 row_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v109, v27, v27 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v27, v27, v27 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_sub_f32_dpp v108, v26, v26 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v26, v26, v26 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_e32 v29, v107 +v_add_f32_dpp v29, v107, v107 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v108, v109 row_ror:12 row_mask:0xf bank_mask:0x1 +v_mov_b32_dpp v108, v109 row_ror:4 row_mask:0xf bank_mask:0x8 +v_mov_b32_e32 v28, v106 +v_add_f32_dpp v28, v106, v106 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v108, v108 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0x3 +v_sub_f32_dpp v109, v107, v107 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v27, v27, v26 row_half_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v20, v29, v28 row_half_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v28, v106, v106 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v28, v109 row_half_mirror row_mask:0xf bank_mask:0x9 +v_mov_b32_dpp v108, v108 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xc +v_cndmask_b32_e64 v21, v108, v27, s[52:53] +v_mov_b32_dpp v22, v28 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0x5 +s_nop 1 +v_mov_b32_dpp v22, v28 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +v_mov_b32_e32 v107, v31 +v_mov_b32_e32 v108, v32 +v_mov_b32_e32 v109, v33 +s_waitcnt lgkmcnt(0) +v_mov_b32_e32 v106, v30 +v_add_f32_dpp v106, v30, v30 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v107, v31, v31 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v108, v32, v32 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_add_f32_dpp v109, v33, v33 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v32, v32, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v33, v33, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v30, v30, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mac_f32_dpp v31, v31, v101 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xc +v_mov_b32_dpp v31, v32 row_mirror row_mask:0xf bank_mask:0x3 +v_mov_b32_dpp v30, v33 row_mirror row_mask:0xf bank_mask:0x3 +v_add_f32_dpp v107, v108, v107 row_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v106, v109, v106 row_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v109, v31, v31 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v31, v31, v31 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_sub_f32_dpp v108, v30, v30 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v30, v30, v30 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_e32 v33, v107 +v_add_f32_dpp v33, v107, v107 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v108, v109 row_ror:12 row_mask:0xf bank_mask:0x1 +v_mov_b32_dpp v108, v109 row_ror:4 row_mask:0xf bank_mask:0x8 +v_mov_b32_e32 v32, v106 +v_add_f32_dpp v32, v106, v106 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v108, v108 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0x3 +v_sub_f32_dpp v109, v107, v107 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_add_f32_dpp v31, v31, v30 row_half_mirror row_mask:0xf bank_mask:0xf +v_add_f32_dpp v23, v33, v32 row_half_mirror row_mask:0xf bank_mask:0xf +v_sub_f32_dpp v32, v106, v106 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0x6 +v_mov_b32_dpp v32, v109 row_half_mirror row_mask:0xf bank_mask:0x9 +v_mov_b32_dpp v108, v108 quad_perm:[2,2,2,2] row_mask:0xf bank_mask:0xc +v_cndmask_b32_e64 v24, v108, v31, s[52:53] +v_mov_b32_dpp v25, v32 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0x5 +s_nop 1 +v_mov_b32_dpp v25, v32 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +s_waitcnt vmcnt(0) +v_readlane_b32 s55, v104, 0 +v_add_f32_e64 v2, v2, s55 +v_mul_f32_e64 v106, v2, s36 +v_cmp_lt_f32_e64 vcc, v2, 0 +v_cndmask_b32_e32 v2, v2, v106, vcc +v_add_f32_e64 v3, v3, s55 +v_mul_f32_e64 v106, v3, s36 +v_cmp_lt_f32_e64 vcc, v3, 0 +v_cndmask_b32_e32 v3, v3, v106, vcc +v_add_f32_e64 v4, v4, s55 +v_mul_f32_e64 v106, v4, s36 +v_cmp_lt_f32_e64 vcc, v4, 0 +v_cndmask_b32_e32 v4, v4, v106, vcc +buffer_store_dword v2, v86, s[44:47], 0 offen +buffer_store_dword v3, v87, s[44:47], 0 offen +buffer_store_dword v4, v88, s[44:47], 0 offen +s_add_u32 s44, s44, s67 +s_addc_u32 s45, s45, 0 +s_sub_u32 s93, s93, 1 +s_cselect_b32 s47, 0, s47 +v_readlane_b32 s55, v104, 1 +v_add_f32_e64 v5, v5, s55 +v_mul_f32_e64 v106, v5, s36 +v_cmp_lt_f32_e64 vcc, v5, 0 +v_cndmask_b32_e32 v5, v5, v106, vcc +v_add_f32_e64 v6, v6, s55 +v_mul_f32_e64 v106, v6, s36 +v_cmp_lt_f32_e64 vcc, v6, 0 +v_cndmask_b32_e32 v6, v6, v106, vcc +v_add_f32_e64 v7, v7, s55 +v_mul_f32_e64 v106, v7, s36 +v_cmp_lt_f32_e64 vcc, v7, 0 +v_cndmask_b32_e32 v7, v7, v106, vcc +buffer_store_dword v5, v86, s[44:47], 0 offen +buffer_store_dword v6, v87, s[44:47], 0 offen +buffer_store_dword v7, v88, s[44:47], 0 offen +s_add_u32 s44, s44, s67 +s_addc_u32 s45, s45, 0 +s_sub_u32 s93, s93, 1 +s_cselect_b32 s47, 0, s47 +v_readlane_b32 s55, v104, 2 +v_add_f32_e64 v8, v8, s55 +v_mul_f32_e64 v106, v8, s36 +v_cmp_lt_f32_e64 vcc, v8, 0 +v_cndmask_b32_e32 v8, v8, v106, vcc +v_add_f32_e64 v9, v9, s55 +v_mul_f32_e64 v106, v9, s36 +v_cmp_lt_f32_e64 vcc, v9, 0 +v_cndmask_b32_e32 v9, v9, v106, vcc +v_add_f32_e64 v10, v10, s55 +v_mul_f32_e64 v106, v10, s36 +v_cmp_lt_f32_e64 vcc, v10, 0 +v_cndmask_b32_e32 v10, v10, v106, vcc +buffer_store_dword v8, v86, s[44:47], 0 offen +buffer_store_dword v9, v87, s[44:47], 0 offen +buffer_store_dword v10, v88, s[44:47], 0 offen +s_add_u32 s44, s44, s67 +s_addc_u32 s45, s45, 0 +s_sub_u32 s93, s93, 1 +s_cselect_b32 s47, 0, s47 +v_readlane_b32 s55, v104, 3 +v_add_f32_e64 v11, v11, s55 +v_mul_f32_e64 v106, v11, s36 +v_cmp_lt_f32_e64 vcc, v11, 0 +v_cndmask_b32_e32 v11, v11, v106, vcc +v_add_f32_e64 v12, v12, s55 +v_mul_f32_e64 v106, v12, s36 +v_cmp_lt_f32_e64 vcc, v12, 0 +v_cndmask_b32_e32 v12, v12, v106, vcc +v_add_f32_e64 v13, v13, s55 +v_mul_f32_e64 v106, v13, s36 +v_cmp_lt_f32_e64 vcc, v13, 0 +v_cndmask_b32_e32 v13, v13, v106, vcc +buffer_store_dword v11, v86, s[44:47], 0 offen +buffer_store_dword v12, v87, s[44:47], 0 offen +buffer_store_dword v13, v88, s[44:47], 0 offen +s_add_u32 s44, s44, s67 +s_addc_u32 s45, s45, 0 +s_sub_u32 s93, s93, 1 +s_cselect_b32 s47, 0, s47 +s_lshl_b32 s52, s67, 2 +s_add_u32 s44, s44, s52 +s_addc_u32 s45, s45, 0 +s_sub_u32 s93, s93, 4 +s_cselect_b32 s47, 0, s47 +v_readlane_b32 s55, v104, 8 +v_add_f32_e64 v14, v14, s55 +v_mul_f32_e64 v106, v14, s36 +v_cmp_lt_f32_e64 vcc, v14, 0 +v_cndmask_b32_e32 v14, v14, v106, vcc +v_add_f32_e64 v15, v15, s55 +v_mul_f32_e64 v106, v15, s36 +v_cmp_lt_f32_e64 vcc, v15, 0 +v_cndmask_b32_e32 v15, v15, v106, vcc +v_add_f32_e64 v16, v16, s55 +v_mul_f32_e64 v106, v16, s36 +v_cmp_lt_f32_e64 vcc, v16, 0 +v_cndmask_b32_e32 v16, v16, v106, vcc +buffer_store_dword v14, v86, s[44:47], 0 offen +buffer_store_dword v15, v87, s[44:47], 0 offen +buffer_store_dword v16, v88, s[44:47], 0 offen +s_add_u32 s44, s44, s67 +s_addc_u32 s45, s45, 0 +s_sub_u32 s93, s93, 1 +s_cselect_b32 s47, 0, s47 +v_readlane_b32 s55, v104, 9 +v_add_f32_e64 v17, v17, s55 +v_mul_f32_e64 v106, v17, s36 +v_cmp_lt_f32_e64 vcc, v17, 0 +v_cndmask_b32_e32 v17, v17, v106, vcc +v_add_f32_e64 v18, v18, s55 +v_mul_f32_e64 v106, v18, s36 +v_cmp_lt_f32_e64 vcc, v18, 0 +v_cndmask_b32_e32 v18, v18, v106, vcc +v_add_f32_e64 v19, v19, s55 +v_mul_f32_e64 v106, v19, s36 +v_cmp_lt_f32_e64 vcc, v19, 0 +v_cndmask_b32_e32 v19, v19, v106, vcc +buffer_store_dword v17, v86, s[44:47], 0 offen +buffer_store_dword v18, v87, s[44:47], 0 offen +buffer_store_dword v19, v88, s[44:47], 0 offen +s_add_u32 s44, s44, s67 +s_addc_u32 s45, s45, 0 +s_sub_u32 s93, s93, 1 +s_cselect_b32 s47, 0, s47 +v_readlane_b32 s55, v104, 10 +v_add_f32_e64 v20, v20, s55 +v_mul_f32_e64 v106, v20, s36 +v_cmp_lt_f32_e64 vcc, v20, 0 +v_cndmask_b32_e32 v20, v20, v106, vcc +v_add_f32_e64 v21, v21, s55 +v_mul_f32_e64 v106, v21, s36 +v_cmp_lt_f32_e64 vcc, v21, 0 +v_cndmask_b32_e32 v21, v21, v106, vcc +v_add_f32_e64 v22, v22, s55 +v_mul_f32_e64 v106, v22, s36 +v_cmp_lt_f32_e64 vcc, v22, 0 +v_cndmask_b32_e32 v22, v22, v106, vcc +buffer_store_dword v20, v86, s[44:47], 0 offen +buffer_store_dword v21, v87, s[44:47], 0 offen +buffer_store_dword v22, v88, s[44:47], 0 offen +s_add_u32 s44, s44, s67 +s_addc_u32 s45, s45, 0 +s_sub_u32 s93, s93, 1 +s_cselect_b32 s47, 0, s47 +v_readlane_b32 s55, v104, 11 +v_add_f32_e64 v23, v23, s55 +v_mul_f32_e64 v106, v23, s36 +v_cmp_lt_f32_e64 vcc, v23, 0 +v_cndmask_b32_e32 v23, v23, v106, vcc +v_add_f32_e64 v24, v24, s55 +v_mul_f32_e64 v106, v24, s36 +v_cmp_lt_f32_e64 vcc, v24, 0 +v_cndmask_b32_e32 v24, v24, v106, vcc +v_add_f32_e64 v25, v25, s55 +v_mul_f32_e64 v106, v25, s36 +v_cmp_lt_f32_e64 vcc, v25, 0 +v_cndmask_b32_e32 v25, v25, v106, vcc +buffer_store_dword v23, v86, s[44:47], 0 offen +buffer_store_dword v24, v87, s[44:47], 0 offen +buffer_store_dword v25, v88, s[44:47], 0 offen +s_add_u32 s44, s44, s67 +s_addc_u32 s45, s45, 0 +s_sub_u32 s93, s93, 1 +s_cselect_b32 s47, 0, s47 +s_add_u32 s44, s44, s52 +s_addc_u32 s45, s45, 0 +s_lshl_b32 s52, s52, 2 +s_add_u32 s44, s44, s52 +s_addc_u32 s45, s45, 0 +s_sub_u32 s93, s93, 20 +s_cselect_b32 s47, 0, s47 +s_cselect_b32 s51, 0, s51 +s_add_u32 s48, s48, 0x80 +s_addc_u32 s49, s49, 0 +s_sub_u32 s50, s50, 0x80 +s_cselect_b32 s51, 0, s51 +v_mov_b32_e32 v2, 0 +v_mov_b32_e32 v3, 0 +v_mov_b32_e32 v4, 0 +v_mov_b32_e32 v5, 0 +v_mov_b32_e32 v6, 0 +v_mov_b32_e32 v7, 0 +v_mov_b32_e32 v8, 0 +v_mov_b32_e32 v9, 0 +v_mov_b32_e32 v10, 0 +v_mov_b32_e32 v11, 0 +v_mov_b32_e32 v12, 0 +v_mov_b32_e32 v13, 0 +v_mov_b32_e32 v14, 0 +v_mov_b32_e32 v15, 0 +v_mov_b32_e32 v16, 0 +v_mov_b32_e32 v17, 0 +v_mov_b32_e32 v18, 0 +v_mov_b32_e32 v19, 0 +v_mov_b32_e32 v20, 0 +v_mov_b32_e32 v21, 0 +v_mov_b32_e32 v22, 0 +v_mov_b32_e32 v23, 0 +v_mov_b32_e32 v24, 0 +v_mov_b32_e32 v25, 0 +v_mov_b32_e32 v26, 0 +v_mov_b32_e32 v27, 0 +v_mov_b32_e32 v28, 0 +v_mov_b32_e32 v29, 0 +v_mov_b32_e32 v30, 0 +v_mov_b32_e32 v31, 0 +v_mov_b32_e32 v32, 0 +v_mov_b32_e32 v33, 0 +s_xor_b32 s18, s18, 0x200000 +s_mul_i32 s94, s60, s61 +s_mul_i32 s94, s94, s13 +s_add_u32 s52, s93, s92 +s_cmp_lt_i32 s52, 0 +s_cbranch_scc0 156 +v_and_b32_e32 v86, 0x7f, v0 +v_lshrrev_b32_e32 v86, 1, v86 +v_bfi_b32 v86, 1, v0, v86 +v_and_b32_e64 v87, v0, 2 +v_mad_u32_u24 v86, v87, 16, v86 +v_lshlrev_b32_e32 v86, 2, v86 +v_add_co_u32_e64 v86, vcc, v86, s97 +v_and_b32_e32 v87, 3, v0 +v_lshlrev_b32_e32 v87, 2, v87 +v_add_co_u32_e64 v87, vcc, v87, s97 +ds_read_b32 v108, v87 offset:256 +ds_read_b32 v86, v86 +s_add_u32 s97, s97, 0x18c +s_cmp_eq_u32 s97, 0xffc0 +s_cselect_b32 s97, 0xc1e0, s97 +s_waitcnt lgkmcnt(0) +v_readfirstlane_b32 s95, v86 +v_readlane_b32 s54, v108, 0 +s_bitcmp1_b32 s54, 18 +s_cbranch_scc1 131 +v_readlane_b32 s52, v108, 1 +v_readlane_b32 s53, v108, 2 +s_add_u32 s93, s92, s53 +s_lshr_b32 s55, -1, 16 +s_and_b32 s55, s55, s66 +s_lshr_b32 s56, s66, 16 +s_mul_i32 s56, s56, s95 +s_mul_i32 s44, s55, s95 +s_lshl_b32 s55, s56, 16 +s_lshr_b32 s56, s56, 16 +s_add_u32 s44, s55, s44 +s_addc_u32 s45, s56, 0 +s_add_u32 s44, s44, s24 +s_addc_u32 s45, s45, s25 +s_mul_i32 s55, s67, s93 +s_add_u32 s44, s44, s55 +s_addc_u32 s45, s45, 0 +s_mov_b32 s47, 0x20000 +s_bitcmp1_b32 s18, 7 +s_cselect_b32 s51, 0x20000, 0 +s_lshl_b32 s55, s93, 2 +s_add_u32 s48, s34, s55 +s_addc_u32 s49, s35, 0 +s_lshl_b32 s56, s52, 2 +s_sub_u32 s50, s56, s55 +s_cselect_b32 s51, 0, s51 +s_sub_u32 s93, s52, s53 +s_sub_u32 s93, s93, 1 +s_sub_u32 s93, s93, s92 +s_cselect_b32 s47, 0, s47 +v_bfe_u32 v106, v86, 16, 16 +v_bfe_u32 v107, v86, 0, 16 +v_and_b32_e64 v108, v0, 7 +v_sub_co_u32_e32 v109, vcc, 7, v108 +v_min_u32_e32 v108, v108, v109 +v_bfe_u32 v109, v108, 1, 1 +v_bfe_u32 v108, v108, 0, 1 +v_mov_b32_dpp v106, v106 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf +v_mov_b32_dpp v107, v107 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf +v_add_co_u32_e32 v106, vcc, v106, v109 +v_add_co_u32_e32 v107, vcc, v107, v108 +v_mov_b32_dpp v108, v86 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf +v_cmp_ge_u32_e64 s[52:53], v108, s12 +v_sub_co_u32_e64 v108, vcc, v108, s95 +v_mul_lo_u32 v108, v108, s66 +v_xor_b32_dpp v109, v0, v0 quad_perm:[0,1,3,2] row_mask:0xf bank_mask:0xf +v_xor_b32_dpp v109, v0, v0 quad_perm:[1,0,2,3] row_mask:0xf bank_mask:0xa +v_xor_b32_dpp v89, v0, v0 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0xf +v_xor_b32_dpp v89, v0, v0 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xa +v_add_co_u32_e32 v89, vcc, v107, v89 +v_add_co_u32_e32 v109, vcc, v106, v109 +v_mad_i32_i24 v86, v109, s33, v89 +v_lshlrev_b32_e32 v86, 2, v86 +v_add_co_u32_e32 v86, vcc, v86, v108 +v_cmp_ge_u32_e64 s[56:57], v89, s33 +s_or_b64 s[56:57], s[56:57], s[52:53] +v_cmp_ge_u32_e64 s[54:55], v109, s32 +s_or_b64 s[56:57], s[56:57], s[54:55] +v_cndmask_b32_e64 v86, v86, -1, s[56:57] +v_xor_b32_dpp v109, v0, v0 quad_perm:[1,0,2,3] row_mask:0xf bank_mask:0xf +v_xor_b32_dpp v109, v0, v0 quad_perm:[0,1,3,2] row_mask:0xf bank_mask:0xa +v_xor_b32_dpp v89, v0, v0 quad_perm:[1,1,2,2] row_mask:0xf bank_mask:0xf +v_add_co_u32_e32 v89, vcc, v107, v89 +v_add_co_u32_e32 v109, vcc, v106, v109 +v_mad_i32_i24 v87, v109, s33, v89 +v_lshlrev_b32_e32 v87, 2, v87 +v_add_co_u32_e32 v87, vcc, v87, v108 +v_cmp_ge_u32_e64 s[56:57], v89, s33 +s_or_b64 s[56:57], s[56:57], s[52:53] +v_cmp_ge_u32_e64 s[54:55], v109, s32 +s_or_b64 s[56:57], s[56:57], s[54:55] +v_cndmask_b32_e64 v87, v87, -1, s[56:57] +v_xor_b32_dpp v109, v0, v0 quad_perm:[0,1,3,2] row_mask:0xf bank_mask:0xf +v_xor_b32_dpp v109, v0, v0 quad_perm:[1,0,2,3] row_mask:0xf bank_mask:0xa +v_xor_b32_dpp v89, v0, v0 quad_perm:[1,1,3,3] row_mask:0xf bank_mask:0xf +v_xor_b32_dpp v89, v0, v0 quad_perm:[0,0,2,2] row_mask:0xf bank_mask:0xa +v_add_co_u32_e32 v89, vcc, v107, v89 +v_add_co_u32_e32 v109, vcc, v106, v109 +v_mad_i32_i24 v88, v109, s33, v89 +v_lshlrev_b32_e32 v88, 2, v88 +v_add_co_u32_e32 v88, vcc, v88, v108 +v_cmp_ge_u32_e64 s[56:57], v89, s33 +s_or_b64 s[56:57], s[56:57], s[52:53] +v_cmp_ge_u32_e64 s[54:55], v109, s32 +s_or_b64 s[56:57], s[56:57], s[54:55] +v_cndmask_b32_e64 v88, v88, -1, s[56:57] +v_and_b32_e64 v104, v0, 63 +v_lshlrev_b32_e32 v104, 2, v104 +s_barrier +buffer_load_dword v104, v104, s[48:51], 0 offen +s_branch 63895 +s_endpgm +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 +s_nop 0 diff --git a/src/kernels/Conv_Winograd_v21_1_2_metadata.inc b/src/kernels/Conv_Winograd_v21_1_2_metadata.inc index bdf66e32a6..d66fdd56c3 100644 --- a/src/kernels/Conv_Winograd_v21_1_2_metadata.inc +++ b/src/kernels/Conv_Winograd_v21_1_2_metadata.inc @@ -211,11 +211,11 @@ METADATA_WRAPPER total_sgpr_count,.amdgcn.next_free_vgpr,workgroup_size_x, <\ker .endm .macro PROLOG_KERNEL_DESCRIPTOR_WRAPPER machine_version, kernel_name_postfix - PROLOG_KERNEL_DESCRIPTOR miopenSp3AsmConv_v21_1_2_gfx\machine_version\()_\kernel_name_postfix + PROLOG_KERNEL_DESCRIPTOR miopenSp3AsmConv_v21_1_2_gfx\machine_version\()_\kernel_name_postfix .endm .macro EPILOG_KERNEL_DESCRIPTOR_WRAPPER machine_version, kernel_name_postfix - EPILOG_KERNEL_DESCRIPTOR miopenSp3AsmConv_v21_1_2_gfx\machine_version\()_\kernel_name_postfix + EPILOG_KERNEL_DESCRIPTOR miopenSp3AsmConv_v21_1_2_gfx\machine_version\()_\kernel_name_postfix .endm .macro KERNEL_PROLOG kernel_name_postfix diff --git a/src/mlo_dir_conv.cpp b/src/mlo_dir_conv.cpp index fcdd9fb806..bd85bd2111 100644 --- a/src/mlo_dir_conv.cpp +++ b/src/mlo_dir_conv.cpp @@ -212,6 +212,7 @@ static auto GetImplicitGemmWrWSolvers() static auto GetWindogradWrWSolvers() { return miopen::solver::SolverContainer, diff --git a/src/solver/conv_asm_implicit_gemm_gtc_bwd.cpp b/src/solver/conv_asm_implicit_gemm_gtc_bwd.cpp index 3d54a24cd6..a7b8c3995c 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_bwd.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_bwd.cpp @@ -872,7 +872,7 @@ static std::tuple 0; if(is_gemm_not_empty) { diff --git a/src/solver/conv_hip_implicit_gemm_bwd_v4r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_v4r1_xdlops.cpp index 5088980242..4893e805e5 100644 --- a/src/solver/conv_hip_implicit_gemm_bwd_v4r1_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_v4r1_xdlops.cpp @@ -198,7 +198,7 @@ PerformanceImplicitGemmBwdDataV4R1Xdlops::CalculateGemmBBlockCopyPerformancePara // calculate threadwise copy size int b_data_per_thread_copy = - std::max(1, (GemmKPerBlock * GemmMPerBlock * GemmKPACKSize) / BlockSize); + std::max(1, (GemmKPerBlock * GemmNPerBlock * GemmKPACKSize) / BlockSize); if(!(b_data_per_thread_copy > 0)) MIOPEN_THROW("invalid performance parameter"); diff --git a/src/solver/conv_mlir_igemm_bwd.cpp b/src/solver/conv_mlir_igemm_bwd.cpp index 6da98a206a..e1498eee0e 100644 --- a/src/solver/conv_mlir_igemm_bwd.cpp +++ b/src/solver/conv_mlir_igemm_bwd.cpp @@ -73,7 +73,7 @@ std::string GetOperation() { return "conv2d_bwd_data"; } bool ConvMlirIgemmBwd::IsApplicable(const ConvolutionContext& ctx) const { #if MIOPEN_USE_MLIR - if(!miopen::IsEnabled(MIOPEN_DEBUG_CONV_MLIR_IGEMM_BWD{})) + if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_MLIR_IGEMM_BWD{})) return false; if(!ctx.IsLayoutDefault() && !ctx.IsLayoutNHWC()) return false; diff --git a/src/solver/conv_mlir_igemm_bwd_xdlops.cpp b/src/solver/conv_mlir_igemm_bwd_xdlops.cpp index 6badd686ae..e679856f73 100644 --- a/src/solver/conv_mlir_igemm_bwd_xdlops.cpp +++ b/src/solver/conv_mlir_igemm_bwd_xdlops.cpp @@ -74,7 +74,7 @@ std::string GetOperation() { return "conv2d_bwd_data"; } bool ConvMlirIgemmBwdXdlops::IsApplicable(const ConvolutionContext& ctx) const { #if MIOPEN_USE_MLIR - if(!miopen::IsEnabled(MIOPEN_DEBUG_CONV_MLIR_IGEMM_BWD_XDLOPS{})) + if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_MLIR_IGEMM_BWD_XDLOPS{})) return false; if(!ctx.IsLayoutDefault() && !ctx.IsLayoutNHWC()) return false; @@ -118,10 +118,10 @@ ConvSolution ConvMlirIgemmBwdXdlops::GetSolution(const ConvolutionContext& ctx) { KernelInfo construction_parameters; - construction_parameters.kernel_name = GetKernelName() + std::to_string(kernel_id); - construction_parameters.kernel_file = construction_parameters.kernel_name + ".mlir"; - construction_parameters.comp_options = - mlir::ConstructBuildOptions(ctx, GetOperation(), GetKernelName(), true, kernel_id); + construction_parameters.kernel_name = GetKernelName() + std::to_string(kernel_id); + construction_parameters.kernel_file = construction_parameters.kernel_name + ".mlir"; + construction_parameters.comp_options = mlir::ConstructBuildOptions( + ctx, GetOperation(), construction_parameters.kernel_name, true, kernel_id); size_t local_size = 0; size_t global_size = 0; diff --git a/src/solver/conv_mlir_igemm_fwd.cpp b/src/solver/conv_mlir_igemm_fwd.cpp index fd87eeb980..11a3d40366 100644 --- a/src/solver/conv_mlir_igemm_fwd.cpp +++ b/src/solver/conv_mlir_igemm_fwd.cpp @@ -38,27 +38,6 @@ namespace solver { namespace { #if MIOPEN_USE_MLIR -std::tuple CalculateGemmSize(const ConvolutionContext& ctx) -{ - const size_t g = ConvolutionContextInterpreter::GetGroupCountG(ctx); - const size_t n = ConvolutionContextInterpreter::GetBatchN(ctx); - const size_t k = ConvolutionContextInterpreter::GetOutputChannelK(ctx); - const size_t c = ConvolutionContextInterpreter::GetInputChannelC(ctx); - const size_t ho = ConvolutionContextInterpreter::GetOutputHeightHo(ctx); - const size_t wo = ConvolutionContextInterpreter::GetOutputWidthWo(ctx); - const size_t y = ConvolutionContextInterpreter::GetFilterHeightY(ctx); - const size_t x = ConvolutionContextInterpreter::GetFilterWidthX(ctx); - - const auto k_per_group = k / g; - const auto c_per_group = c / g; - - const auto gemm_m = k_per_group; - const auto gemm_n = n * ho * wo; - const auto gemm_k_total = c_per_group * y * x; - - return std::make_tuple(gemm_m, gemm_n, gemm_k_total); -} - std::string GetKernelName() { std::string version = "_v4r4"; @@ -87,14 +66,6 @@ bool ConvMlirIgemmFwd::IsApplicable(const ConvolutionContext& ctx) const if(!ctx.IsFp32() && !ctx.IsFp16()) return false; - int gemm_m = 0; - int gemm_n = 0; - int gemm_k = 0; - - std::tie(gemm_m, gemm_n, gemm_k) = CalculateGemmSize(ctx); - if(!(gemm_m % 32 == 0 && gemm_n % 32 == 0 && gemm_k % 4 == 0)) - return false; - return MiirIsConfigApplicable( mlir::ConstructBuildOptions(ctx, GetOperation(), GetKernelName(), false)); #else diff --git a/src/solver/conv_mlir_igemm_fwd_xdlops.cpp b/src/solver/conv_mlir_igemm_fwd_xdlops.cpp index acc908a938..0f3d96b172 100644 --- a/src/solver/conv_mlir_igemm_fwd_xdlops.cpp +++ b/src/solver/conv_mlir_igemm_fwd_xdlops.cpp @@ -39,27 +39,6 @@ namespace solver { namespace { #if MIOPEN_USE_MLIR -std::tuple CalculateGemmSize(const ConvolutionContext& ctx) -{ - const size_t g = ConvolutionContextInterpreter::GetGroupCountG(ctx); - const size_t n = ConvolutionContextInterpreter::GetBatchN(ctx); - const size_t k = ConvolutionContextInterpreter::GetOutputChannelK(ctx); - const size_t c = ConvolutionContextInterpreter::GetInputChannelC(ctx); - const size_t ho = ConvolutionContextInterpreter::GetOutputHeightHo(ctx); - const size_t wo = ConvolutionContextInterpreter::GetOutputWidthWo(ctx); - const size_t y = ConvolutionContextInterpreter::GetFilterHeightY(ctx); - const size_t x = ConvolutionContextInterpreter::GetFilterWidthX(ctx); - - const auto k_per_group = k / g; - const auto c_per_group = c / g; - - const auto gemm_m = k_per_group; - const auto gemm_n = n * ho * wo; - const auto gemm_k_total = c_per_group * y * x; - - return std::make_tuple(gemm_m, gemm_n, gemm_k_total); -} - std::string GetKernelName() { std::string version = "_v4r4"; @@ -90,15 +69,6 @@ bool ConvMlirIgemmFwdXdlops::IsApplicable(const ConvolutionContext& ctx) const if(!ctx.IsFp32() && !ctx.IsFp16()) return false; - int gemm_m = 0; - int gemm_n = 0; - int gemm_k = 0; - - std::tie(gemm_m, gemm_n, gemm_k) = CalculateGemmSize(ctx); - - if(!IsValidGridGemmXdlops(gemm_m, gemm_n, gemm_k)) - return false; - return MiirIsConfigApplicable( mlir::ConstructBuildOptions(ctx, GetOperation(), GetKernelName(), true)); #else diff --git a/src/solver/conv_mlir_igemm_wrw.cpp b/src/solver/conv_mlir_igemm_wrw.cpp index e2680d394b..6e72226246 100644 --- a/src/solver/conv_mlir_igemm_wrw.cpp +++ b/src/solver/conv_mlir_igemm_wrw.cpp @@ -39,27 +39,6 @@ namespace solver { namespace { #if MIOPEN_USE_MLIR -std::tuple CalculateGemmSize(const ConvolutionContext& ctx) -{ - const auto g = ConvolutionContextInterpreter::GetGroupCountG(ctx); - const size_t n = ConvolutionContextInterpreter::GetBatchN(ctx); - const size_t c = ConvolutionContextInterpreter::GetInputChannelC(ctx); - const size_t k = ConvolutionContextInterpreter::GetOutputChannelK(ctx); - const size_t ho = ConvolutionContextInterpreter::GetOutputHeightHo(ctx); - const size_t wo = ConvolutionContextInterpreter::GetOutputWidthWo(ctx); - const size_t y = ConvolutionContextInterpreter::GetFilterHeightY(ctx); - const size_t x = ConvolutionContextInterpreter::GetFilterWidthX(ctx); - - const auto k_per_group = k / g; - const auto c_per_group = c / g; - - const auto gemm_m = k_per_group; - const auto gemm_n = c_per_group * y * x; - const auto gemm_k_total = n * ho * wo; - - return std::make_tuple(gemm_m, gemm_n, gemm_k_total); -} - std::string GetKernelName() { std::string version = "_v4r4"; @@ -90,14 +69,6 @@ bool ConvMlirIgemmWrW::IsApplicable(const ConvolutionContext& ctx) const if(!ctx.IsFp32() && !ctx.IsFp16()) return false; - int gemm_m = 0; - int gemm_n = 0; - int gemm_k = 0; - - std::tie(gemm_m, gemm_n, gemm_k) = CalculateGemmSize(ctx); - if(!(gemm_m % 32 == 0 && gemm_n % 32 == 0 && gemm_k % 4 == 0)) - return false; - return MiirIsConfigApplicable( mlir::ConstructBuildOptions(ctx, GetOperation(), GetKernelName(), false)); #else diff --git a/src/solver/conv_mlir_igemm_wrw_xdlops.cpp b/src/solver/conv_mlir_igemm_wrw_xdlops.cpp index b566d22415..b56b070817 100644 --- a/src/solver/conv_mlir_igemm_wrw_xdlops.cpp +++ b/src/solver/conv_mlir_igemm_wrw_xdlops.cpp @@ -40,28 +40,6 @@ namespace solver { namespace { #if MIOPEN_USE_MLIR - -std::tuple CalculateGemmSize(const ConvolutionContext& ctx) -{ - const auto g = ConvolutionContextInterpreter::GetGroupCountG(ctx); - const size_t n = ConvolutionContextInterpreter::GetBatchN(ctx); - const size_t c = ConvolutionContextInterpreter::GetInputChannelC(ctx); - const size_t k = ConvolutionContextInterpreter::GetOutputChannelK(ctx); - const size_t ho = ConvolutionContextInterpreter::GetOutputHeightHo(ctx); - const size_t wo = ConvolutionContextInterpreter::GetOutputWidthWo(ctx); - const size_t y = ConvolutionContextInterpreter::GetFilterHeightY(ctx); - const size_t x = ConvolutionContextInterpreter::GetFilterWidthX(ctx); - - const auto k_per_group = k / g; - const auto c_per_group = c / g; - - const auto gemm_m = k_per_group; - const auto gemm_n = c_per_group * y * x; - const auto gemm_k_total = n * ho * wo; - - return std::make_tuple(gemm_m, gemm_n, gemm_k_total); -} - std::string GetKernelName() { std::string version = "_v4r4"; @@ -93,15 +71,6 @@ bool ConvMlirIgemmWrWXdlops::IsApplicable(const ConvolutionContext& ctx) const if(!ctx.IsFp32() && !ctx.IsFp16()) return false; - int gemm_m = 0; - int gemm_n = 0; - int gemm_k = 0; - - std::tie(gemm_m, gemm_n, gemm_k) = CalculateGemmSize(ctx); - - if(!IsValidGridGemmXdlops(gemm_m, gemm_n, gemm_k)) - return false; - return MiirIsConfigApplicable( mlir::ConstructBuildOptions(ctx, GetOperation(), GetKernelName(), true)); #else diff --git a/src/solver/conv_winoRxS_f3x2.cpp b/src/solver/conv_winoRxS_f3x2.cpp index 86e02f99a5..6ace9a4eef 100644 --- a/src/solver/conv_winoRxS_f3x2.cpp +++ b/src/solver/conv_winoRxS_f3x2.cpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2017 Advanced Micro Devices, Inc. + * Copyright (c) 2021 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -32,158 +32,140 @@ #include #include #include +#include +#include +#include #include MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F3X2) +MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F3X2_PERF_VALS) -/// \return v rounded up (towards +inf) to the nearest multiple of m. -/// Defined for positive values only. -static inline int Ceiling(const int v, const int m) -{ - assert(m > 0 && v >= 0); - if(v % m != 0) - { - return (v / m + 1) * m; - } - return v; -} +#define WINODATA 3 +#define WINOFILTER 2 +#define MAX_CU_LIMIT 512 -/// \return Value equivalent to ceil(x/y). -/// Defined for positive values only. -static inline int CeilDiv(const int x, const int y) +static inline size_t Ceil(const size_t v, const size_t m) { - assert(y > 0); - return Ceiling(x, y) / y; + assert(m > 0); + return (v + m - 1) / m; } -/// \return Value equivalent to floor(x/y). -/// Defined for positive values only. -static inline int FloorDiv(const int x, const int y) +static inline size_t RoundUpToMultiple(size_t val, size_t factor) { - assert(x >= 0 && y > 0); - return x / y; + return Ceil(val, factor) * factor; } -/// \todo Consider re-using code from RxS. -static inline bool IsShaderContraintsMet(const int R, - const int S, - const int R_stride, - const int S_stride, - const int C, - const int K, - const int H, - const int W, - const int OH, - const int OW, - const int N, - const miopen::ConvolutionContext& params, - const bool fp16, - const unsigned filter_tile_size) +/// \todo Consider re-using code from RxS_f2x3. +static inline int GetBestNGroupParam(const int R, + const int S, + const int R_stride, + const int S_stride, + const int C, + const int K, + const int OH, + const int OW, + const int pad_H, + const int pad_W, + const int N, + const int idilation_w, + const int idilation_h, + const int n_groups, + const int G) { - const auto TILE = static_cast(filter_tile_size); - const int TILE_X2 = TILE * 2; - // Calculate padded filter size first. - // If stride = 1: if S <= 3 it is padded to 3, - // otherwise S is padded to smallest 6*n for some integer n - // If stride = 2: S is always padded to smallest 6*n for some integer n - int padded_S = 0; - if(S_stride == 1) - { - if(S <= TILE) - { - padded_S = TILE; - } - else - { - padded_S = Ceiling(S, TILE_X2); - } - } - else - { - padded_S = Ceiling(S, TILE_X2); - } - // If stride = 1: R is always padded to smallest 3*m for some integer m - // If stride = 2: if R % 6 ==1 then R is padded to smallest 3*m for some - // integer m, otherwise R is padded to smallest 6*m for some integer m - int padded_R = 0; - if(R_stride == 1) - { - padded_R = Ceiling(R, TILE); - } - else + int o_tile = WINODATA; + int f_tile = WINOFILTER; + int r_factor = f_tile * 2; + int s_factor = r_factor; + int c_factor = 2; + int k_factor = 32; + int nwh_factor = 32; + int w_factor = o_tile * idilation_w * S_stride; + int h_factor = o_tile * idilation_h * R_stride; + + if(S_stride == 1 && idilation_w == 1 && S <= f_tile) + s_factor = f_tile; + if((R_stride == 1 && idilation_h == 1) || (R % (f_tile * 2)) == 1) + r_factor = f_tile; + if(S_stride == 2 || R_stride == 2 || idilation_w == 2 || idilation_h == 2) + c_factor = 1; + + size_t g_s = RoundUpToMultiple(S, s_factor); + size_t g_r = RoundUpToMultiple(R, r_factor); + size_t g_c = RoundUpToMultiple(C, c_factor); + size_t g_k = RoundUpToMultiple(K, k_factor); + size_t g_w = OW; + size_t g_h = OH; + + if((pad_W % 2 == 0) && (idilation_w > 1 || S_stride > 1)) + g_w += 1; + if((pad_H % 2 == 1) && (idilation_h > 1 || R_stride > 1)) + g_h += 1; + + g_w = RoundUpToMultiple(g_w, w_factor); + g_h = RoundUpToMultiple(g_h, h_factor); + size_t g_n_w_h = RoundUpToMultiple(g_w * g_h * N, nwh_factor * w_factor * h_factor); + + int best_n_groups_cnt = 1; + double min_param = 0; + for(auto i = 1; i < n_groups; ++i) { - if(R % TILE_X2 == 1) - { - padded_R = Ceiling(R, TILE); - } + size_t g_n_w_h_k = + RoundUpToMultiple(g_n_w_h * g_k, nwh_factor * w_factor * h_factor * k_factor * i); + size_t granulated_mac_count = g_n_w_h_k * g_c * g_s * g_r; + size_t n_groups_per_cu = Ceil(i * G, n_groups); + double perf_metric = static_cast(n_groups_per_cu) * granulated_mac_count / i; + if(static_cast(granulated_mac_count) / i > 1.0e+7) + perf_metric *= (1 + i * 0.003); else + perf_metric *= (1 + i * 0.04); + if(i == 1) + min_param = perf_metric; + if(min_param > perf_metric) { - padded_R = Ceiling(R, TILE_X2); - } - } - // Check C restrictions: - // For FP16, all C restrictions shall be multipled by 2. - // This implicitly introduces restriction that C must be even. - if(fp16 && C % 2 != 0) - { - return false; - } - // If stride == 1 and S <= 3 then C needs to be even, otherwise not - if(S_stride == 1 && S <= TILE && C % (fp16 ? 4 : 2) != 0) - { - return false; - } - const bool is_dilated_stride_2 = (params.direction.IsBackwardData() && S_stride != 1); - if(fp16) - { - if(is_dilated_stride_2) - { - if(C % 4 != 0) - return false; - // In dilation mode with stride== 2 the following should be satisfied: - // C * (ceil(R/6) + floor((R+4)/6)) * ceil(S/6) >= 18*2 (fp16) - const auto k = CeilDiv(R, TILE_X2) + FloorDiv((R + TILE + 1), TILE_X2); - const auto l = CeilDiv(S, TILE_X2); - if(C * k * l < 18 * 2) - return false; - } - if(padded_R * padded_S * C < TILE * TILE * 18 * 2) - return false; - } - else - { - // 9_0_14 readme: Additional limitations in the dilated case are R> 1 and C %2==0 - if(is_dilated_stride_2) - { - if(!(R > 1)) - return false; - if(!(C % 2 == 0)) - return false; + best_n_groups_cnt = i; + min_param = perf_metric; } - // If the padded_R x padded_S filter size from above is 3*k x 3*l - // or (special case for dilated with stride 2) 3*k x 6*l, then - // it should be that k*l*C >=18 - assert(padded_R % TILE == 0 && padded_S % (is_dilated_stride_2 ? TILE_X2 : TILE) == 0); - const int k = padded_R / TILE; - const int l = padded_S / (is_dilated_stride_2 ? TILE_X2 : TILE); - if(k * l * C < 18) - return false; } + return best_n_groups_cnt; +} + +namespace miopen { +namespace solver { + +namespace { +// clang-format off +auto PerfFieldRules() +{ + return seq::MakeRuleSet( + std::make_tuple(seq::Span{}, &PerformanceConfigConvBinWinogradRxSf3x2::n_groups) + ); +} +// clang-format on + +/// \todo Consider re-using code from RxS_f2x3. +inline bool IsShaderContraintsMet(const int R, + const int S, + const int C, + const int K, + const int H, + const int W, + const int OH, + const int OW, + const int N, + const ConvolutionContext& params) +{ // Padding for bwd data shall not be negative. /// \todo Either remove WrW related code or re-use function from RxS - if(params.direction.IsBackwardData() || params.direction.IsBackwardWrW()) + if(params.direction.IsBackwardData()) { if(!(0 <= params.GetBackwardPadW() && params.GetBackwardPadW() < std::pow(2, 16))) return false; if(!(0 <= params.GetBackwardPadH() && params.GetBackwardPadH() < std::pow(2, 16))) return false; } - const auto grid_workgroup_count_x = params.GetStream().GetMaxComputeUnits(); - if(!params.IsLayoutDefault()) - { - return false; - } + + const auto grid_workgroup_count_x = params.GetStream().GetMaxHardwareComputeUnits(); // clang-format off // Check implementation limits. @@ -207,8 +189,115 @@ static inline bool IsShaderContraintsMet(const int R, // clang-format on } -namespace miopen { -namespace solver { +} // namespace + +PerformanceConfigConvBinWinogradRxSf3x2::PerformanceConfigConvBinWinogradRxSf3x2(int n_groups_) + : n_groups(n_groups_) +{ +} + +void PerformanceConfigConvBinWinogradRxSf3x2::HeuristicInit(const ConvolutionContext& config) +{ + const auto n_inputs_per_group = config.n_inputs / config.group_counts, + n_outputs_per_group = config.n_outputs / config.group_counts; + if(config.group_counts == 1) + { + n_groups = config.GetStream().GetMaxHardwareComputeUnits(); + return; + } + + if(config.direction.IsBackwardWrW()) + { + n_groups = GetBestNGroupParam(config.in_height, + config.in_width, + config.kernel_dilation_h, + config.kernel_dilation_w, + config.batch_sz, // N + n_inputs_per_group, // K + config.kernel_size_h, + config.kernel_size_w, + config.pad_w, + config.pad_h, + n_outputs_per_group, // C + config.kernel_stride_h, + config.kernel_stride_w, + config.GetStream().GetMaxHardwareComputeUnits(), + config.group_counts); + } + else + { + n_groups = GetBestNGroupParam(config.kernel_size_h, // RxS + config.kernel_size_w, + config.kernel_stride_h, + config.kernel_stride_w, + n_inputs_per_group, // C + n_outputs_per_group, // K + config.out_height, // OHxOW + config.out_width, + config.pad_w, + config.pad_h, + config.batch_sz, // N + config.kernel_dilation_h, + config.kernel_dilation_w, + config.GetStream().GetMaxHardwareComputeUnits(), + config.group_counts); + } +} + +bool PerformanceConfigConvBinWinogradRxSf3x2::SetNextValue() +{ + return !PerfFieldRules().Next(*this); +} + +bool PerformanceConfigConvBinWinogradRxSf3x2::IsValidValue() const +{ + return PerfFieldRules().IsIn(*this); +} + +bool PerformanceConfigConvBinWinogradRxSf3x2::IsValid(const ConvolutionContext& config) const +{ + if(config.GetStream().GetMaxHardwareComputeUnits() < n_groups) + return false; + + if(!IsValidValue()) + return false; + return true; +} + +inline bool PerformanceConfigConvBinWinogradRxSf3x2:: +operator==(const PerformanceConfigConvBinWinogradRxSf3x2& other) const +{ + return n_groups == other.n_groups; +} + +std::string PerformanceConfigConvBinWinogradRxSf3x2::ToString() const +{ + std::ostringstream ss; + Serialize(ss); + return ss.str(); +} + +PerformanceConfigConvBinWinogradRxSf3x2 +ConvBinWinogradRxSf3x2::GetPerformanceConfig(const ConvolutionContext& params) const +{ + PerformanceConfigConvBinWinogradRxSf3x2 pp; + pp.HeuristicInit(params); + MIOPEN_LOG_I(pp.ToString()); + return pp; +} + +bool ConvBinWinogradRxSf3x2::IsValidPerformanceConfig( + const ConvolutionContext& problem, const PerformanceConfigConvBinWinogradRxSf3x2& c) const +{ + return c.IsValidValue() && c.IsValid(problem); +} + +PerformanceConfigConvBinWinogradRxSf3x2 +ConvBinWinogradRxSf3x2::Search(const ConvolutionContext& context, + const AnyInvokeParams& invoke_ctx) const +{ + return GenericSearch(*this, context, invoke_ctx); +} bool ConvBinWinogradRxSf3x2::IsApplicable(const ConvolutionContext& params) const { @@ -218,19 +307,19 @@ bool ConvBinWinogradRxSf3x2::IsApplicable(const ConvolutionContext& params) cons return false; if(miopen::IsDisabled(MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F3X2{})) return false; - if(!(params.direction.IsForward() || params.direction.IsBackwardData())) - return false; if(!params.use_asm_kernels) return false; - if(!params.rmv.IsV2orV3()) + if(!params.rmv.IsV3()) return false; if(!params.IsLayoutDefault()) - { return false; - } + + const auto max_cu = params.GetStream().GetMaxHardwareComputeUnits(); + if(max_cu > MAX_CU_LIMIT) + return false; const auto name = params.GetStream().GetDeviceName(); - if(!StartsWith(name, "gfx9") || name == "gfx90a") + if(!(StartsWith(name, "gfx9") || StartsWith(name, "gfx10")) || name == "gfx90a") return false; // clang-format off @@ -238,142 +327,349 @@ bool ConvBinWinogradRxSf3x2::IsApplicable(const ConvolutionContext& params) cons && params.kernel_stride_w == params.kernel_stride_h && params.kernel_dilation_w == 1 && params.kernel_dilation_h == 1 - && params.bias == 0 - && params.group_counts == 1 - && params.in_layout == "NCHW")) + && params.bias == 0)) return false; // clang-format on - return IsShaderContraintsMet(params.kernel_size_h, // RxS - params.kernel_size_w, - params.kernel_stride_h, - params.kernel_stride_w, - params.n_inputs, // C - params.n_outputs, // K - params.in_height, // HxW - params.in_width, - params.out_height, // OHxOW - params.out_width, - params.batch_sz, // N - params, - false, - 2); + const auto n_inputs_per_group = params.n_inputs / params.group_counts, + n_outputs_per_group = params.n_outputs / params.group_counts; + + if(params.direction.IsBackwardWrW()) + { + return IsShaderContraintsMet(params.in_height, + params.in_width, + params.batch_sz, // N + n_inputs_per_group, // K + params.out_height, + params.out_width, + params.kernel_size_h, + params.kernel_size_w, + n_outputs_per_group, // C + params); + } + else + { + return IsShaderContraintsMet(params.kernel_size_h, // RxS + params.kernel_size_w, + n_inputs_per_group, // C + n_outputs_per_group, // K + params.in_height, // HxW + params.in_width, + params.out_height, // OHxOW + params.out_width, + params.batch_sz, // N + params); + } } -ConvSolution ConvBinWinogradRxSf3x2::GetSolution(const ConvolutionContext& params) const +/// \todo Consider re-using code from RxS_f2x3. +ConvSolution +ConvBinWinogradRxSf3x2::GetSolution(const ConvolutionContext& params, + const PerformanceConfigConvBinWinogradRxSf3x2& config, + const bool disableConfigOverrideFromEnv) const { + const PerformanceConfigConvBinWinogradRxSf3x2* pcfg = &config; + PerformanceConfigConvBinWinogradRxSf3x2 fromEnv; + if(!disableConfigOverrideFromEnv) + { + std::string s; + const auto p_asciz = miopen::GetStringEnv(MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F3X2_PERF_VALS{}); + if(p_asciz != nullptr) + { + s = std::string(p_asciz); + if(!s.empty()) // else nothing to parse. + { + if(!fromEnv.Deserialize(s) || !fromEnv.IsValid(params)) + { + MIOPEN_LOG_E("MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F3X2_PERF_VALS: " + "Bad format or invalid for the problem config: " + << s); + } + else + { + MIOPEN_LOG_I("Overridden from env: " << fromEnv.ToString()); + pcfg = &fromEnv; + } + } + } + } + ConvSolution result; - const auto n_groups = params.GetStream().GetMaxComputeUnits(); KernelInfo kernel; - kernel.g_wk.push_back(512 * n_groups); + const auto n_groups = pcfg->GetNGroups(); + const auto name = params.GetStream().GetDeviceName(); + const auto is_gfx9 = StartsWith(name, "gfx9"); + size_t wg_size = is_gfx9 ? 512 : 256; + + kernel.g_wk.push_back(wg_size * n_groups * params.group_counts); kernel.g_wk.push_back(1); kernel.g_wk.push_back(1); - kernel.l_wk.push_back(512); + kernel.l_wk.push_back(wg_size); kernel.l_wk.push_back(1); kernel.l_wk.push_back(1); KernelBuildParameters options{ - {"ROCM_METADATA_VERSION", params.rmv.UseV3() ? 5 : 4}, + {"ROCM_METADATA_VERSION", 5}, }; kernel.comp_options = options.GenerateFor(kbp::GcnAsm{}); - kernel.kernel_name = "miopenSp3AsmConvRxSf3x2"; - kernel.kernel_file = "Conv_Winograd_v16_5_0_stride1.s"; + std::string kernel_name = "miopenSp3AsmConv_v21_1_2"; + std::string kernel_file = "Conv_Winograd_v21_1_2"; + std::string kernel_postfix = params.IsFp32() ? "_f3x2_fp32" : "_f3x2_fp16_dot2_edc"; + + if(is_gfx9) + { + kernel_name += "_gfx9"; + } + else // if(StartsWith(name, "gfx10")) + { + kernel_name += "_gfx10"; + kernel.comp_options += std::string(" -mcumode -mwavefrontsize64"); + } + + if(params.kernel_stride_w == 1) + { + kernel_postfix += "_stride1"; + } + + kernel_postfix += "_group"; + kernel.kernel_name = kernel_name + kernel_postfix; + kernel.kernel_file = kernel_file + kernel_postfix + ".s"; result.construction_params.push_back(kernel); - const auto is_forward = params.direction.IsForward(); - - constexpr int F_REVERSE_R = 1 << 0; - constexpr int F_REVERSE_S = 1 << 1; - constexpr int F_FLIP_K_C = 1 << 2; - // These are not used yet. Nevertheless let's keep as a shader documentation. - // constexpr int F_FLIP_DATA_N_C = 1 << 3; // Unsupported in f3x2. - // constexpr int F_FLIP_OUT_N_K = 1 << 4; // Unsupported in f3x2. - // constexpr int L_F_ADDR_INDIRECT = 1 << 6; - // constexpr int L_F_BIAS = 1 << 7; - // constexpr int L_F_LEAKY_RELU = 1 << 8; - constexpr int L_F_NKC_STRIDES = 1 << 9; - - int flags = is_forward ? 0 : F_REVERSE_R + F_REVERSE_S + F_FLIP_K_C; - int reserved = 0; - int* reserved_ptr = nullptr; - int N, C, H, W, K, n_groups_, out_H, out_W, R, S, pad_H, pad_W; - GetCompiledInParameters( - params, &N, &C, &H, &W, &K, &n_groups_, &out_H, &out_W, &R, &S, &pad_H, &pad_W); - MIOPEN_LOG_I2(" N=" << N << " C=" << C << " H=" << H << " W=" << W << " K=" << K << " n_groups=" - << n_groups_ - << " flags=" - << flags - << " R=" - << R - << " S=" - << S - << " pad_H=" - << pad_H - << " pad_W=" - << pad_W - << " out_H=" - << out_H - << " out_W=" - << out_W); - - flags += L_F_NKC_STRIDES; - /// \todo Consider using BufferInfo to compute strides - constexpr int SIZEOF_DATA = 4; - int d_C_stride = H * W * SIZEOF_DATA; - int d_N_stride = C * d_C_stride; - int f_C_stride = R * S * SIZEOF_DATA * (is_forward ? 1 : K); - int f_K_stride = R * S * SIZEOF_DATA * (is_forward ? C : 1); - int o_K_stride = out_H * out_W * SIZEOF_DATA; - int o_N_stride = K * o_K_stride; - - MIOPEN_LOG_I2("...flags=" << flags << " d_N_stride=" << d_N_stride << " d_C_stride=" - << d_C_stride - << " f_K_stride=" - << f_K_stride - << " f_C_stride=" - << f_C_stride - << " o_N_stride=" - << o_N_stride - << " o_K_stride=" - << o_K_stride); - - result.invoker_factory = [=](const std::vector& kernels) { - return [=](const Handle& handle, const AnyInvokeParams& ctx) { - const auto k = handle.Run(kernels[0]); - const auto& fwd_ctx = ctx.CastTo(); - const auto& tensors = fwd_ctx.tensors; - - k(N, - C, - H, - W, - K, - n_groups_, - flags, - reserved, - tensors.in, - tensors.w, - tensors.out, - reserved_ptr, - R, - S, - pad_H, - pad_W, - out_H, - out_W, - reserved_ptr, - reserved, - d_N_stride, - d_C_stride, - f_K_stride, - f_C_stride, - o_N_stride, - o_K_stride); + + if(!params.direction.IsBackwardWrW()) + { + const bool is_forward = params.direction.IsForward(); + constexpr int F_REVERSE_R = 1 << 0; + constexpr int F_REVERSE_S = 1 << 1; + constexpr int F_FLIP_K_C = 1 << 2; + // These are not used yet. Nevertheless let's keep as a shader documentation. + // constexpr int F_FLIP_DATA_N_C = 1 << 3; // Unsupported in f3x2. + // constexpr int F_FLIP_OUT_N_K = 1 << 4; // Unsupported in f3x2. + // constexpr int L_F_ADDR_INDIRECT = 1 << 6; + // constexpr int L_F_BIAS = 1 << 7; + // constexpr int L_F_LEAKY_RELU = 1 << 8; + constexpr int L_F_NKC_STRIDES = 1 << 9; + constexpr int L_F_GROUP_STRIDES = 1 << 10; + // constexpr int L_F_FORCE_FILTER_TRAVERSE_MODE = 1 << 11; + // constexpr int L_F_FILTER_TRAVERSE_DUAL = 1 << 12; + // constexpr int L_F_TENSOR_OFFSETS = 1 << 13; + // constexpr int L_F_USE_EXTENDED_FLAGS_64 = 1 << 15; + int reserved = 0; + uint64_t reserved_offset = 0; + int* reserved_ptr = nullptr; + int ignore; + + int N, C, H, W, K, out_H, out_W, R, S, pad_H, pad_W; + GetCompiledInParameters( + params, &N, &C, &H, &W, &K, &ignore, &out_H, &out_W, &R, &S, &pad_H, &pad_W); + const auto group_cnt = params.group_counts; + C = C / group_cnt; + K = K / group_cnt; + int flags = is_forward ? 0 : F_REVERSE_R + F_REVERSE_S + F_FLIP_K_C; + flags |= L_F_NKC_STRIDES + L_F_GROUP_STRIDES; + + // cppcheck-suppress unreadVariable + BuffInfo d_buf(GetGroupConvLayout(GetMemLayout_t(params.in_layout), true), + N, + C, + H, + W, + group_cnt, + GetTypeSize(params.in_data_type)), + // cppcheck-suppress unreadVariable + o_buf(GetGroupConvLayout(GetMemLayout_t(params.out_layout), true), + N, + K, + out_H, + out_W, + group_cnt, + GetTypeSize(params.out_data_type)), + // cppcheck-suppress unreadVariable + f_buf(GetGroupConvLayout(is_forward ? (MemLayout_t::NCHW) + : GetSwappedNCLayout(MemLayout_t::NCHW), + false), + K, + C, + R, + S, + group_cnt, + GetTypeSize(params.weights_data_type)); + + result.invoker_factory = [=](std::vector kernels) { + return [=](const Handle& handle, const AnyInvokeParams& primitive_params) { + const auto k = handle.Run(kernels[0]); + const auto& data_ctx = primitive_params.CastTo(); + const auto& tensors = data_ctx.tensors; + + // clang-format off + MIOPEN_LOG_I2(" N=" << N << " G=" << group_cnt << " C=" << C << " H=" << H << " W=" << W << " K=" << K + << " n_groups=" << n_groups << " flags=" << flags << " R=" << R << " S=" << S + << " pad_H=" << pad_H << " pad_W=" << pad_W << " out_H=" << out_H << " out_W=" << out_W + << " d_buf.byte_stride.nk=" << d_buf.byte_stride.nk << " d_buf.byte_stride.c=" << d_buf.byte_stride.c + << " d_buf.byte_stride.h=" << d_buf.byte_stride.h << " d_buf.byte_stride.w=" << d_buf.byte_stride.w + << " f_buf.byte_stride.nk=" << f_buf.byte_stride.nk << " f_buf.byte_stride.c=" << f_buf.byte_stride.c + << " f_buf.byte_stride.h=" << f_buf.byte_stride.h << " f_buf.byte_stride.w=" << f_buf.byte_stride.w + << " o_buf.byte_stride.nk=" << o_buf.byte_stride.nk << " o_buf.byte_stride.c=" << o_buf.byte_stride.c + << " o_buf.byte_stride.h=" << o_buf.byte_stride.h << " o_buf.byte_stride.w=" << o_buf.byte_stride.w + << " d_buf.byte_stride.g=" << d_buf.byte_stride.g << " o_buf.byte_stride.g=" << o_buf.byte_stride.g + << " f_buf.byte_stride.g=" << f_buf.byte_stride.g); // clang-format on + + k(N, + C, + H, + W, + K, + n_groups, + flags, + reserved, + tensors.in, + tensors.w, + tensors.out, + reserved_ptr, // Unused return_addr. + R, + S, + pad_H, // Like Fwd wino. + pad_W, + out_H, + out_W, + reserved_ptr, // Unused bias_addr. + reserved, // Unused relu_alpha. + reserved, // Unused reserved2. + reserved_offset, // Unused d_offset. + reserved_offset, // Unused f_offset. + reserved_offset, // Unused o_offset. + reserved_offset, // Unused b_offset. + d_buf.byte_stride.nk, + d_buf.byte_stride.c, + d_buf.byte_stride.h, + d_buf.byte_stride.w, + f_buf.byte_stride.nk, + f_buf.byte_stride.c, + f_buf.byte_stride.h, + f_buf.byte_stride.w, + o_buf.byte_stride.nk, + o_buf.byte_stride.c, + o_buf.byte_stride.h, + o_buf.byte_stride.w, + group_cnt, + d_buf.byte_stride.g, + f_buf.byte_stride.g, + o_buf.byte_stride.g); + }; }; - }; + } + else + { + int unused = 0; + int N, C, H, W, K, out_H, out_W, R, S; + GetCompiledInParameters( + params, &C, &K, &R, &S, &N, &unused, &H, &W, &out_H, &out_W, &unused, &unused); + const auto group_cnt = params.group_counts; + static const int F_NKC_STRIDES = 1 << 9; + static const int F_GROUP_STRIDES = 1 << 10; + int flags = F_NKC_STRIDES + F_GROUP_STRIDES; + N = N / group_cnt; + K = K / group_cnt; + int pad_H = params.conv_problem.GetConv().GetConvPads()[0]; + int pad_W = params.conv_problem.GetConv().GetConvPads()[1]; + + BuffInfo d_buf( + GetGroupConvLayout(GetSwappedNCLayout(GetMemLayout_t(params.in_layout)), true), + N, + C, + H, + W, + group_cnt, + GetTypeSize(params.in_data_type)), + o_buf(GetGroupConvLayout(GetSwappedNCLayout(GetMemLayout_t(params.out_layout)), false), + N, + K, + out_H, + out_W, + group_cnt, + GetTypeSize(params.out_data_type)), + f_buf(GetGroupConvLayout(GetSwappedNCLayout(MemLayout_t::NCHW), true), + K, + C, + R, + S, + group_cnt, + GetTypeSize(params.weights_data_type)); + + decltype(auto) batch_sz = params.batch_sz; + decltype(auto) n_inputs = params.n_inputs; + + result.invoker_factory = [=](std::vector kernels) { + return [=](const Handle& handle, const AnyInvokeParams& primitive_params) { + decltype(auto) invoke_params = primitive_params.CastTo(); + const auto& tensors = invoke_params.tensors; + + // clang-format off + MIOPEN_LOG_I2(" N=" << N << " G=" << group_cnt << " C=" << C << " H=" << H << " W=" << W << " K=" << K + << " n_groups=" << n_groups << " flags=" << flags << " R=" << R << " S=" << S + << " pad_H=" << pad_H << " pad_W=" << pad_W << " out_H=" << out_H << " out_W=" << out_W + << " d_buf.byte_stride.nk=" << d_buf.byte_stride.nk << " d_buf.byte_stride.c=" << d_buf.byte_stride.c + << " d_buf.byte_stride.h=" << d_buf.byte_stride.h << " d_buf.byte_stride.w=" << d_buf.byte_stride.w + << " f_buf.byte_stride.nk=" << f_buf.byte_stride.nk << " f_buf.byte_stride.c=" << f_buf.byte_stride.c + << " f_buf.byte_stride.h=" << f_buf.byte_stride.h << " f_buf.byte_stride.w=" << f_buf.byte_stride.w + << " o_buf.byte_stride.nk=" << o_buf.byte_stride.nk << " o_buf.byte_stride.c=" << o_buf.byte_stride.c + << " o_buf.byte_stride.h=" << o_buf.byte_stride.h << " o_buf.byte_stride.w=" << o_buf.byte_stride.w + << " d_buf.byte_stride.g=" << d_buf.byte_stride.g << " o_buf.byte_stride.g=" << o_buf.byte_stride.g + << " f_buf.byte_stride.g=" << f_buf.byte_stride.g); // clang-format on + MIOPEN_LOG_I2(" ctx.batch_sz=" << batch_sz << "ctx.n_inputs=" << n_inputs); + + int reserved = 0; + uint64_t reserved_offset = 0; + int* reserved_ptr = nullptr; + + handle.Run(kernels[0])(N, + C, + H, + W, + K, + n_groups, + flags, + reserved, + tensors.x, + tensors.dy, + tensors.dw, + reserved_ptr, // Unused return_addr. + R, + S, + pad_H, + pad_W, + out_H, + out_W, + reserved_ptr, // Unused bias_addr. + reserved, // Unused relu_alpha. + reserved, // Unused reserved2. + reserved_offset, // Unused d_offset. + reserved_offset, // Unused f_offset. + reserved_offset, // Unused o_offset. + reserved_offset, // Unused b_offset. + d_buf.byte_stride.nk, + d_buf.byte_stride.c, + d_buf.byte_stride.h, + d_buf.byte_stride.w, + f_buf.byte_stride.nk, + f_buf.byte_stride.c, + f_buf.byte_stride.h, + f_buf.byte_stride.w, + o_buf.byte_stride.nk, + o_buf.byte_stride.c, + o_buf.byte_stride.h, + o_buf.byte_stride.w, + group_cnt, + d_buf.byte_stride.g, + f_buf.byte_stride.g, + o_buf.byte_stride.g); + }; + }; + } return result; } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c948e799ce..bedc8f50df 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -87,6 +87,7 @@ elseif(MIOPEN_TEST_INT8) elseif(MIOPEN_TEST_BFLOAT16) set(MIOPEN_TEST_FLOAT_ARG --bfloat16) else() + set(MIOPEN_TEST_FLOAT_ARG --float) set(MIOPEN_TEST_FLOAT TRUE) endif() @@ -293,13 +294,13 @@ function(option_support_check is_anabled is_disabled default_result result) endfunction() # The add_custom_test function contains options to describe the conditions, -# under which new custom_tests should be run. Options are divided into several types. +# under which new custom_tests should be run. Options are divided into several types. # The option can be enabled or disabled, if nothing is specified, the default value is taken. -# You can use any number of options, provided that options do not conflict +# You can use any number of options, provided that options do not conflict # (e.g. "HALF_ENABLE HALF_DISABLE" is illegal) # 1)First describes supported data type. ( HALF BF16 INT8 FLOAT ...) # The option can be enabled or disabled by using '_ENABLED' and '_DISABLED' suffix. -# If nothing is specified, the default value is taken. +# If nothing is specified, the default value is taken. # Default: HALF=disabled, BF16=disabled, INT8=disabled, FLOAT=enabled. # 2)Second options type describes support GPU types (gfx900, gfx906, gfx908 ...) # The option can be enabled or disabled by using '_ENABLED' and '_DISABLED' suffix. @@ -319,7 +320,7 @@ endfunction() # Default: OCL=enabled, HIP=enabled, HIP_NOGPU=disabled. function(add_custom_test NAME) - set(options + set(options BF16_ENABLED BF16_DISABLED HALF_ENABLED HALF_DISABLED INT8_ENABLED INT8_DISABLED FLOAT_ENABLED FLOAT_DISABLED VEGA_ENABLED VEGA_DISABLED GFX908_ENABLED GFX908_DISABLED MIOTENSILE_ENABLED MIOTENSILE_DISABLED MLIR_ENABLED MLIR_DISABLED @@ -337,7 +338,7 @@ function(add_custom_test NAME) set(HALF_TEST_DEFAULT FALSE) option_support_check(${PARSE_HALF_ENABLED} ${PARSE_HALF_DISABLED} ${HALF_TEST_DEFAULT} is_half_check) bool_and_f(${MIOPEN_TEST_HALF} ${is_half_check} is_half_check) - + set(is_bfloat16_check) set(BF16_TEST_DEFAULT FALSE) option_support_check(${PARSE_BF16_ENABLED} ${PARSE_BF16_DISABLED} ${BF16_TEST_DEFAULT} is_bfloat16_check) @@ -352,7 +353,7 @@ function(add_custom_test NAME) set(FLOAT_TEST_DEFAULT TRUE) option_support_check(${PARSE_FLOAT_ENABLED} ${PARSE_FLOAT_DISABLED} ${FLOAT_TEST_DEFAULT} is_float_check) bool_and_f(${MIOPEN_TEST_FLOAT} ${is_float_check} is_float_check) - + set(is_miotensile_check) set(MIOTENSILE_TEST_DEFAULT FALSE) option_support_check(${PARSE_MIOTENSILE_ENABLED} ${PARSE_MIOTENSILE_DISABLED} ${MIOTENSILE_TEST_DEFAULT} is_miotensile_check) @@ -376,7 +377,7 @@ function(add_custom_test NAME) option_support_check(${PARSE_HIP_ENABLED} ${PARSE_HIP_DISABLED} ${HIP_TEST_DEFAULT} is_hip_check) bool_not_f(${MIOPEN_TEST_HIP} NOT_MIOPEN_TEST_HIP) bool_or_f(${NOT_MIOPEN_TEST_HIP} ${is_hip_check} is_hip_check) - + set(is_hip_nogpu_check) set(HIP_NOGPU_TEST_DEFAULT FALSE) option_support_check(${PARSE_HIP_NOGPU_ENABLED} ${PARSE_HIP_NOGPU_DISABLED} ${HIP_NOGPU_TEST_DEFAULT} is_hip_nogpu_check) @@ -402,7 +403,7 @@ function(add_custom_test NAME) add_custom_target(${NAME} ${PARSE_UNPARSED_ARGUMENTS}) add_test(NAME ${NAME} COMMAND ${CMAKE_COMMAND} --build ${CMAKE_CURRENT_BINARY_DIR} --target ${NAME}) if( (is_vega_check OR is_gfx908_check) - AND is_full_check + AND is_full_check AND (is_miotensile_check AND is_mlir_check) AND ( is_half_check OR is_bfloat16_check OR is_int8_check OR is_float_check) AND (is_ocl_check AND is_hip_check AND is_hip_nogpu_check) @@ -428,7 +429,7 @@ if(MIOPEN_EMBED_DB) set(MIOPEN_WA_ISSUE_874_F MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R1=0) set(MIOPEN_WA_ISSUE_874_W MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R1=0) set(MIOPEN_WA_ISSUE_874_FW MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R1=0 MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R1=0) -add_custom_test(test_conv_embed_db TEST_PERF_DB_RECORD_NOT_FOUND +add_custom_test(test_conv_embed_db TEST_PERF_DB_RECORD_NOT_FOUND COMMAND ${MIOPEN_WA_ISSUE_874_W} $ ${MIOPEN_EMBED_TEST_ARG} --input 128 1024 14 14 --weights 2048 1024 1 1 --pads_strides_dilations 0 0 2 2 1 1 COMMAND ${MIOPEN_WA_ISSUE_874_F} $ ${MIOPEN_EMBED_TEST_ARG} --input 128 1024 14 14 --weights 256 1024 1 1 --pads_strides_dilations 0 0 1 1 1 1 COMMAND ${MIOPEN_WA_ISSUE_874_W} $ ${MIOPEN_EMBED_TEST_ARG} --input 128 1024 14 14 --weights 512 1024 1 1 --pads_strides_dilations 0 0 2 2 1 1 @@ -462,7 +463,7 @@ if(MIOPEN_TEST_MLIR) set(IMPLICITGEMM_MLIR_ARGS_F ${IMPLICITGEMM_ARGS} --verbose --disable-backward-data --disable-backward-weights) set(IMPLICITGEMM_MLIR_ARGS_B ${IMPLICITGEMM_ARGS} --verbose --disable-forward --disable-backward-weights) set(IMPLICITGEMM_MLIR_ARGS_W ${IMPLICITGEMM_ARGS} --verbose --disable-forward --disable-backward-data) - + add_custom_test(test_conv_igemm_mlir HALF_ENABLED MLIR_ENABLED COMMAND ${IMPLICITGEMM_MLIR_ENV_F} $ ${IMPLICITGEMM_MLIR_ARGS_F} --input 256 1024 14 14 --weights 2048 1024 1 1 --pads_strides_dilations 0 0 2 2 1 1 COMMAND ${IMPLICITGEMM_MLIR_ENV_F} $ ${IMPLICITGEMM_MLIR_ARGS_F} --input 256 1024 14 14 --weights 2048 1024 1 1 --pads_strides_dilations 0 0 2 2 1 1 --in_layout NHWC --fil_layout NHWC --out_layout NHWC @@ -474,6 +475,16 @@ if(MIOPEN_TEST_MLIR) COMMAND ${IMPLICITGEMM_MLIR_ENV_F} $ ${IMPLICITGEMM_MLIR_ARGS_F} --input 128 64 56 56 --weights 64 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --in_layout NHWC --fil_layout NHWC --out_layout NHWC COMMAND ${IMPLICITGEMM_MLIR_ENV_F} $ ${IMPLICITGEMM_MLIR_ARGS_F} --input 256 256 56 56 --weights 256 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --group-count 4 + COMMAND ${IMPLICITGEMM_MLIR_ENV_B} $ ${IMPLICITGEMM_MLIR_ARGS_B} --input 256 1024 14 14 --weights 2048 1024 1 1 --pads_strides_dilations 0 0 2 2 1 1 + COMMAND ${IMPLICITGEMM_MLIR_ENV_B} $ ${IMPLICITGEMM_MLIR_ARGS_B} --input 256 1024 14 14 --weights 2048 1024 1 1 --pads_strides_dilations 0 0 2 2 1 1 --in_layout NHWC --fil_layout NHWC --out_layout NHWC + COMMAND ${IMPLICITGEMM_MLIR_ENV_B} $ ${IMPLICITGEMM_MLIR_ARGS_B} --input 256 128 28 28 --weights 128 128 3 3 --pads_strides_dilations 1 1 1 1 1 1 + COMMAND ${IMPLICITGEMM_MLIR_ENV_B} $ ${IMPLICITGEMM_MLIR_ARGS_B} --input 256 128 28 28 --weights 128 128 3 3 --pads_strides_dilations 1 1 1 1 1 1 --in_layout NHWC --fil_layout NHWC --out_layout NHWC + COMMAND ${IMPLICITGEMM_MLIR_ENV_B} $ ${IMPLICITGEMM_MLIR_ARGS_B} --input 128 512 7 7 --weights 512 512 3 3 --pads_strides_dilations 1 1 1 1 1 1 + COMMAND ${IMPLICITGEMM_MLIR_ENV_B} $ ${IMPLICITGEMM_MLIR_ARGS_B} --input 128 512 7 7 --weights 512 512 3 3 --pads_strides_dilations 1 1 1 1 1 1 --in_layout NHWC --fil_layout NHWC --out_layout NHWC + COMMAND ${IMPLICITGEMM_MLIR_ENV_B} $ ${IMPLICITGEMM_MLIR_ARGS_B} --input 128 64 56 56 --weights 64 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 + COMMAND ${IMPLICITGEMM_MLIR_ENV_B} $ ${IMPLICITGEMM_MLIR_ARGS_B} --input 128 64 56 56 --weights 64 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --in_layout NHWC --fil_layout NHWC --out_layout NHWC + COMMAND ${IMPLICITGEMM_MLIR_ENV_B} $ ${IMPLICITGEMM_MLIR_ARGS_B} --input 256 256 56 56 --weights 256 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --group-count 4 + COMMAND ${IMPLICITGEMM_MLIR_ENV_W} $ ${IMPLICITGEMM_MLIR_ARGS_W} --input 64 1024 14 14 --weights 256 1024 1 1 --pads_strides_dilations 0 0 1 1 1 1 COMMAND ${IMPLICITGEMM_MLIR_ENV_W} $ ${IMPLICITGEMM_MLIR_ARGS_W} --input 64 1024 14 14 --weights 256 1024 1 1 --pads_strides_dilations 0 0 1 1 1 1 --in_layout NHWC --fil_layout NHWC --out_layout NHWC COMMAND ${IMPLICITGEMM_MLIR_ENV_W} $ ${IMPLICITGEMM_MLIR_ARGS_W} --input 256 256 14 14 --weights 256 256 3 3 --pads_strides_dilations 0 0 2 2 1 1 @@ -504,6 +515,16 @@ if(MIOPEN_TEST_MLIR) COMMAND ${IMPLICITGEMM_MLIR_ENV_F_XDLOPS} $ ${IMPLICITGEMM_MLIR_ARGS_F} --input 128 64 56 56 --weights 64 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --in_layout NHWC --fil_layout NHWC --out_layout NHWC COMMAND ${IMPLICITGEMM_MLIR_ENV_F_XDLOPS} $ ${IMPLICITGEMM_MLIR_ARGS_F} --input 256 256 56 56 --weights 256 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --group-count 4 + COMMAND ${IMPLICITGEMM_MLIR_ENV_B_XDLOPS} $ ${IMPLICITGEMM_MLIR_ARGS_B} --input 256 1024 14 14 --weights 2048 1024 1 1 --pads_strides_dilations 0 0 2 2 1 1 + COMMAND ${IMPLICITGEMM_MLIR_ENV_B_XDLOPS} $ ${IMPLICITGEMM_MLIR_ARGS_B} --input 256 1024 14 14 --weights 2048 1024 1 1 --pads_strides_dilations 0 0 2 2 1 1 --in_layout NHWC --fil_layout NHWC --out_layout NHWC + COMMAND ${IMPLICITGEMM_MLIR_ENV_B_XDLOPS} $ ${IMPLICITGEMM_MLIR_ARGS_B} --input 256 128 28 28 --weights 128 128 3 3 --pads_strides_dilations 1 1 1 1 1 1 + COMMAND ${IMPLICITGEMM_MLIR_ENV_B_XDLOPS} $ ${IMPLICITGEMM_MLIR_ARGS_B} --input 256 128 28 28 --weights 128 128 3 3 --pads_strides_dilations 1 1 1 1 1 1 --in_layout NHWC --fil_layout NHWC --out_layout NHWC + COMMAND ${IMPLICITGEMM_MLIR_ENV_B_XDLOPS} $ ${IMPLICITGEMM_MLIR_ARGS_B} --input 128 512 7 7 --weights 512 512 3 3 --pads_strides_dilations 1 1 1 1 1 1 + COMMAND ${IMPLICITGEMM_MLIR_ENV_B_XDLOPS} $ ${IMPLICITGEMM_MLIR_ARGS_B} --input 128 512 7 7 --weights 512 512 3 3 --pads_strides_dilations 1 1 1 1 1 1 --in_layout NHWC --fil_layout NHWC --out_layout NHWC + COMMAND ${IMPLICITGEMM_MLIR_ENV_B_XDLOPS} $ ${IMPLICITGEMM_MLIR_ARGS_B} --input 128 64 56 56 --weights 64 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 + COMMAND ${IMPLICITGEMM_MLIR_ENV_B_XDLOPS} $ ${IMPLICITGEMM_MLIR_ARGS_B} --input 128 64 56 56 --weights 64 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --in_layout NHWC --fil_layout NHWC --out_layout NHWC + COMMAND ${IMPLICITGEMM_MLIR_ENV_B_XDLOPS} $ ${IMPLICITGEMM_MLIR_ARGS_B} --input 256 256 56 56 --weights 256 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --group-count 4 + COMMAND ${IMPLICITGEMM_MLIR_ENV_W_XDLOPS} $ ${IMPLICITGEMM_MLIR_ARGS_W} --input 64 1024 14 14 --weights 256 1024 1 1 --pads_strides_dilations 0 0 1 1 1 1 COMMAND ${IMPLICITGEMM_MLIR_ENV_W_XDLOPS} $ ${IMPLICITGEMM_MLIR_ARGS_W} --input 64 1024 14 14 --weights 256 1024 1 1 --pads_strides_dilations 0 0 1 1 1 1 --in_layout NHWC --fil_layout NHWC --out_layout NHWC COMMAND ${IMPLICITGEMM_MLIR_ENV_W_XDLOPS} $ ${IMPLICITGEMM_MLIR_ARGS_W} --input 256 256 14 14 --weights 256 256 3 3 --pads_strides_dilations 0 0 2 2 1 1 @@ -888,7 +909,6 @@ set(DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS ${DYNAMIC_IMPLICITGEMM_COMMON} MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC) -if(NOT (NOT MIOPEN_TEST_FLOAT OR MIOPEN_TEST_GFX908)) add_custom_test(test_conv_igemm_dynamic_small COMMAND ${DYNAMIC_IMPLICITGEMM_ENVS} $ --verbose --input 16 16 56 56 --weights 64 16 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights COMMAND ${DYNAMIC_IMPLICITGEMM_ENVS} $ --verbose --input 16 64 34 34 --weights 64 64 3 3 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights @@ -899,7 +919,8 @@ COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS} $ --verbose -- COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS} $ --verbose --input 64 64 28 28 --weights 16 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS} $ --verbose --input 16 128 36 36 --weights 32 128 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights ) -add_custom_test(test_conv_igemm_dynamic SKIP_UNLESS_ALL + +add_custom_test(test_conv_igemm_dynamic SKIP_UNLESS_ALL COMMAND ${DYNAMIC_IMPLICITGEMM_ENVS} $ --verbose --input 64 64 56 56 --weights 256 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights COMMAND ${DYNAMIC_IMPLICITGEMM_ENVS} $ --verbose --input 64 256 34 34 --weights 256 256 3 3 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights COMMAND ${DYNAMIC_IMPLICITGEMM_ENVS} $ --verbose --input 128 128 35 35 --weights 128 128 3 3 --pads_strides_dilations 0 0 2 2 1 1 --disable-backward-data --disable-backward-weights @@ -918,97 +939,101 @@ COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS} $ --verbose -- COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS} $ --verbose --input 128 128 35 35 --weights 128 128 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-forward --disable-backward-weights COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS} $ --verbose --input 128 256 56 56 --weights 64 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights ) -endif() -if(MIOPEN_TEST_GFX908) add_custom_test(test_conv_igemm_dynamic_xdlops_bwd SKIP_UNLESS_ALL HALF_ENABLED GFX908_ENABLED VEGA_DISABLED -COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ --verbose --input 64 64 28 28 --weights 16 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ --verbose --input 16 128 36 36 --weights 32 128 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ --verbose --input 64 64 56 56 --weights 256 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ --verbose --input 64 224 17 17 --weights 224 224 1 7 --pads_strides_dilations 0 3 1 1 1 1 --disable-forward --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ --verbose --input 128 128 35 35 --weights 256 128 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-forward --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ --verbose --input 128 128 64 64 --weights 256 128 3 3 --pads_strides_dilations 1 1 2 2 1 1 --disable-forward --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ --verbose --input 128 768 17 17 --weights 256 768 3 3 --pads_strides_dilations 1 1 1 1 2 2 --disable-forward --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ --verbose --input 3 256 28 28 --weights 80 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ --verbose --input 2 256 12 18 --weights 256 256 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-forward --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ --verbose --input 4 512 128 128 --weights 12 512 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ --verbose --input 400 256 7 7 --weights 1024 256 7 7 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ --verbose --input 400 256 1 1 --weights 1024 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ --verbose --input 8 16 5 5 --weights 8 16 2 2 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights +COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 64 64 28 28 --weights 16 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights +COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 16 128 36 36 --weights 32 128 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights +COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 64 64 56 56 --weights 256 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights +COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 64 224 17 17 --weights 224 224 1 7 --pads_strides_dilations 0 3 1 1 1 1 --disable-forward --disable-backward-weights +COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 128 128 35 35 --weights 256 128 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-forward --disable-backward-weights +COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 128 128 64 64 --weights 256 128 3 3 --pads_strides_dilations 1 1 2 2 1 1 --disable-forward --disable-backward-weights +COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 128 768 17 17 --weights 256 768 3 3 --pads_strides_dilations 1 1 1 1 2 2 --disable-forward --disable-backward-weights +COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 3 256 28 28 --weights 80 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights +COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 2 256 12 18 --weights 256 256 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-forward --disable-backward-weights +# WORKAROUND_ISSUE_995 +# COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 4 512 128 128 --weights 12 512 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights +COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 400 256 7 7 --weights 1024 256 7 7 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights +COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 400 256 1 1 --weights 1024 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights +COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 8 16 5 5 --weights 8 16 2 2 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights ) add_custom_test(test_conv_igemm_dynamic_xdlops_fwd SKIP_UNLESS_ALL HALF_ENABLED GFX908_ENABLED VEGA_DISABLED -COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ --verbose --input 64 1024 14 14 --weights 1024 1024 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ --verbose --input 64 256 56 56 --weights 512 256 1 1 --pads_strides_dilations 0 0 2 2 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ --verbose --input 64 2048 7 7 --weights 2048 2048 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights +COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 64 1024 14 14 --weights 1024 1024 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights +COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 64 256 56 56 --weights 512 256 1 1 --pads_strides_dilations 0 0 2 2 1 1 --disable-backward-data --disable-backward-weights +COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 64 2048 7 7 --weights 2048 2048 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights # WORKAROUND_ISSUE_954 -# COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ --verbose --input 64 3 224 224 --weights 64 3 7 7 --pads_strides_dilations 3 3 2 2 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ --verbose --input 128 128 17 17 --weights 128 128 7 1 --pads_strides_dilations 3 0 1 1 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ --verbose --input 128 128 17 17 --weights 128 128 1 7 --pads_strides_dilations 0 3 1 1 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ --verbose --input 128 192 17 17 --weights 320 192 3 3 --pads_strides_dilations 0 0 2 2 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ --verbose --input 128 256 35 35 --weights 64 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ --verbose --input 128 48 35 35 --weights 64 48 5 5 --pads_strides_dilations 2 2 1 1 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ --verbose --input 64 512 7 7 --weights 512 512 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-backward-data --disable-backward-weights +# COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 64 3 224 224 --weights 64 3 7 7 --pads_strides_dilations 3 3 2 2 1 1 --disable-backward-data --disable-backward-weights +COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 128 128 17 17 --weights 128 128 7 1 --pads_strides_dilations 3 0 1 1 1 1 --disable-backward-data --disable-backward-weights +COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 128 128 17 17 --weights 128 128 1 7 --pads_strides_dilations 0 3 1 1 1 1 --disable-backward-data --disable-backward-weights +COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 128 192 17 17 --weights 320 192 3 3 --pads_strides_dilations 0 0 2 2 1 1 --disable-backward-data --disable-backward-weights +COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 128 256 35 35 --weights 64 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights +COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 128 48 35 35 --weights 64 48 5 5 --pads_strides_dilations 2 2 1 1 1 1 --disable-backward-data --disable-backward-weights +COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 64 512 7 7 --weights 512 512 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-backward-data --disable-backward-weights # WORKAROUND_ISSUE_954 -# COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ --verbose --input 64 3 230 230 --weights 64 3 7 7 --pads_strides_dilations 0 0 2 2 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ --verbose --input 32 1024 14 14 --weights 2048 1024 1 1 --pads_strides_dilations 0 0 2 2 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ --verbose --input 2 256 100 104 --weights 12 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ --verbose --input 1 256 28 28 --weights 80 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights +# COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 64 3 230 230 --weights 64 3 7 7 --pads_strides_dilations 0 0 2 2 1 1 --disable-backward-data --disable-backward-weights +COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 32 1024 14 14 --weights 2048 1024 1 1 --pads_strides_dilations 0 0 2 2 1 1 --disable-backward-data --disable-backward-weights +COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 2 256 100 104 --weights 12 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights +COMMAND ${DYNAMIC_IMPLICITGEMM_FWD_GTC_DYNAMIC_XDLOPS_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 1 256 28 28 --weights 80 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights ) -add_custom_test(test_conv_igemm_dynamic_xdlops_wrw SKIP_UNLESS_ALL SKIP_UNLESS_ALL GFX908_ENABLED VEGA_DISABLED HALF_ENABLED -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ --verbose --input 64 64 28 28 --weights 32 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ --verbose --input 16 128 36 36 --weights 32 128 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ --verbose --input 64 64 56 56 --weights 256 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ --verbose --input 64 224 17 17 --weights 224 224 1 7 --pads_strides_dilations 0 3 1 1 1 1 --disable-forward --disable-backward-data -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ --verbose --input 128 128 35 35 --weights 256 128 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-forward --disable-backward-data -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ --verbose --input 128 128 64 64 --weights 256 128 3 3 --pads_strides_dilations 1 1 2 2 1 1 --disable-forward --disable-backward-data -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ --verbose --input 128 768 17 17 --weights 256 768 3 3 --pads_strides_dilations 1 1 1 1 2 2 --disable-forward --disable-backward-data -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ --verbose --input 3 256 28 28 --weights 80 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ --verbose --input 2 256 12 18 --weights 256 256 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-forward --disable-backward-data -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ --verbose --input 4 512 128 128 --weights 12 512 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ --verbose --input 400 256 7 7 --weights 1024 256 7 7 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ --verbose --input 400 256 1 1 --weights 1024 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data -# some single batch cases -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ --verbose --half --input 1 3 32 32 --weights 1 3 11 11 --pads_strides_dilations 1 1 2 2 2 1 --disable-forward --disable-backward-data -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ --verbose --half --input 1 3 224 224 --weights 1 3 3 3 --pads_strides_dilations 0 0 1 1 2 2 --disable-forward --disable-backward-data -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ --verbose --half --input 1 1 8 8 --weights 1 1 2 2 --pads_strides_dilations 0 0 1 1 2 2 --disable-forward --disable-backward-data -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ --verbose --half --input 1 128 56 56 --weights 1 128 5 5 --pads_strides_dilations 0 0 2 2 1 1 --disable-forward --disable-backward-data + +add_custom_test(test_conv_igemm_dynamic_xdlops_wrw SKIP_UNLESS_ALL GFX908_ENABLED VEGA_DISABLED HALF_ENABLED +COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 64 64 28 28 --weights 32 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data +COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 16 128 36 36 --weights 32 128 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data +COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 64 64 56 56 --weights 256 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data +COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 64 224 17 17 --weights 224 224 1 7 --pads_strides_dilations 0 3 1 1 1 1 --disable-forward --disable-backward-data +COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 128 128 35 35 --weights 256 128 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-forward --disable-backward-data +COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 128 128 64 64 --weights 256 128 3 3 --pads_strides_dilations 1 1 2 2 1 1 --disable-forward --disable-backward-data +COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 128 768 17 17 --weights 256 768 3 3 --pads_strides_dilations 1 1 1 1 2 2 --disable-forward --disable-backward-data +COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 3 256 28 28 --weights 80 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data +COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 2 256 12 18 --weights 256 256 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-forward --disable-backward-data +COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 4 512 128 128 --weights 12 512 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data +# WORKAROUND_ISSUE_996 +# COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 400 256 7 7 --weights 1024 256 7 7 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data +# WORKAROUND_ISSUE_996 +# COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 400 256 1 1 --weights 1024 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data +# WORKAROUND_ISSUE_995 +# COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 1 3 32 32 --weights 1 3 11 11 --pads_strides_dilations 1 1 2 2 2 1 --disable-forward --disable-backward-data +# WORKAROUND_ISSUE_995 +# COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 1 3 224 224 --weights 1 3 3 3 --pads_strides_dilations 0 0 1 1 2 2 --disable-forward --disable-backward-data +# WORKAROUND_ISSUE_995 +# COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 1 1 8 8 --weights 1 1 2 2 --pads_strides_dilations 0 0 1 1 2 2 --disable-forward --disable-backward-data +# WORKAROUND_ISSUE_995 +# COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 1 128 56 56 --weights 1 128 5 5 --pads_strides_dilations 0 0 2 2 1 1 --disable-forward --disable-backward-data ) + add_custom_test(test_conv_igemm_dynamic_xdlops_nhwc_fwd SKIP_UNLESS_ALL HALF_ENABLED GFX908_ENABLED VEGA_DISABLED -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 64 256 7 7 --weights 128 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 32 160 73 73 --weights 64 160 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 16 64 56 56 --weights 64 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 2 256 40 52 --weights 256 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 2 64 59 57 --weights 12 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 32 128 14 14 --weights 64 128 1 1 --pads_strides_dilations 0 0 2 2 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 64 64 17 17 --weights 192 64 1 7 --pads_strides_dilations 0 3 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 64 64 17 17 --weights 192 64 7 1 --pads_strides_dilations 3 0 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 4 128 28 28 --weights 128 128 2 2 --pads_strides_dilations 0 0 2 2 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 32 128 8 8 --weights 192 128 3 1 --pads_strides_dilations 1 0 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 64 192 17 17 --weights 160 192 3 3 --pads_strides_dilations 0 0 2 2 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 64 32 73 73 --weights 64 32 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 16 64 56 56 --weights 64 64 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ --verbose --input 64 3 78 78 --weights 64 3 7 7 --pads_strides_dilations 0 0 2 2 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 64 256 7 7 --weights 128 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 32 160 73 73 --weights 64 160 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 16 64 56 56 --weights 64 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 2 256 40 52 --weights 256 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 2 64 59 57 --weights 12 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 32 128 14 14 --weights 64 128 1 1 --pads_strides_dilations 0 0 2 2 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 64 64 17 17 --weights 192 64 1 7 --pads_strides_dilations 0 3 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 64 64 17 17 --weights 192 64 7 1 --pads_strides_dilations 3 0 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 4 128 28 28 --weights 128 128 2 2 --pads_strides_dilations 0 0 2 2 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 32 128 8 8 --weights 192 128 3 1 --pads_strides_dilations 1 0 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 64 192 17 17 --weights 160 192 3 3 --pads_strides_dilations 0 0 2 2 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 64 32 73 73 --weights 64 32 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 16 64 56 56 --weights 64 64 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_FWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 64 3 78 78 --weights 64 3 7 7 --pads_strides_dilations 0 0 2 2 1 1 --disable-backward-data --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC ) add_custom_test(test_conv_igemm_dynamic_xdlops_nhwc_bwd SKIP_UNLESS_ALL HALF_ENABLED GFX908_ENABLED VEGA_DISABLED -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 64 256 7 7 --weights 128 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 32 160 73 73 --weights 64 160 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 16 64 56 56 --weights 64 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 2 256 40 52 --weights 256 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 2 64 32 28 --weights 64 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 32 128 14 14 --weights 64 128 1 1 --pads_strides_dilations 0 0 2 2 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 64 64 17 17 --weights 192 64 1 7 --pads_strides_dilations 0 3 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 64 64 17 17 --weights 192 64 7 1 --pads_strides_dilations 3 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 4 128 28 28 --weights 128 128 2 2 --pads_strides_dilations 0 0 2 2 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 32 128 8 8 --weights 192 128 3 1 --pads_strides_dilations 1 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 64 192 17 17 --weights 160 192 3 3 --pads_strides_dilations 0 0 2 2 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 64 32 73 73 --weights 64 32 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 16 64 56 56 --weights 64 64 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC -COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ --verbose --input 16 16 25 25 --weights 64 16 3 3 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 64 256 7 7 --weights 128 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 32 160 73 73 --weights 64 160 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 16 64 56 56 --weights 64 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 2 256 40 52 --weights 256 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 2 64 32 28 --weights 64 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 32 128 14 14 --weights 64 128 1 1 --pads_strides_dilations 0 0 2 2 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 64 64 17 17 --weights 192 64 1 7 --pads_strides_dilations 0 3 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 64 64 17 17 --weights 192 64 7 1 --pads_strides_dilations 3 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 4 128 28 28 --weights 128 128 2 2 --pads_strides_dilations 0 0 2 2 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 32 128 8 8 --weights 192 128 3 1 --pads_strides_dilations 1 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 64 192 17 17 --weights 160 192 3 3 --pads_strides_dilations 0 0 2 2 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 64 32 73 73 --weights 64 32 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 16 64 56 56 --weights 64 64 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC +COMMAND ${DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 16 16 25 25 --weights 64 16 3 3 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights --in_layout NHWC --fil_layout NHWC --out_layout NHWC ) -endif() - if(MIOPEN_TEST_DEEPBENCH) add_custom_test(test_deepbench_conv MIOTENSILE_ENABLED COMMAND $ --verbose --input 4 1 161 700 --weights 32 1 5 20 --pads_strides_dilations 0 0 2 2 1 1 @@ -1096,7 +1121,8 @@ if(MIOPEN_TEST_CONV) endif() if(MIOPEN_TEST_FLOAT) - add_custom_test(test_reduce_double SKIP_UNLESS_ALL GFX908_ENABLED COMMAND $ --double --all --verbose) +# WORKAROUND_SWDEV_291479 +# add_custom_test(test_reduce_double SKIP_UNLESS_ALL GFX908_ENABLED COMMAND $ --double --all --verbose) endif() # Add here regression tests that should be run only on Vega10/20 and only with FP16. @@ -1104,6 +1130,6 @@ add_custom_test(test_regression_half_vega FLOAT_DISABLED HALF_ENABLED GFX908_DIS # REGRESSION TEST for issue #894. # Can't be enabled for GFX908 due to WORKAROUND_ISSUE_2298 # Can't be enabled for GFX10 due to WORKAROUND_SWDEV_271887 -COMMAND MIOPEN_FIND_MODE=normal MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvOclDirectFwd1x1 $ --verbose --disable-backward-data --disable-backward-weights --disable-verification-cache - --half --cmode conv --pmode default --group-count 1 --input 1 16 7 7 --weights 16 16 1 1 --pads_strides_dilations 0 0 1 1 1 1 +COMMAND MIOPEN_FIND_MODE=normal MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvOclDirectFwd1x1 $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --disable-backward-data --disable-backward-weights --disable-verification-cache + --cmode conv --pmode default --group-count 1 --input 1 16 7 7 --weights 16 16 1 1 --pads_strides_dilations 0 0 1 1 1 1 ) diff --git a/test/network_data.hpp b/test/network_data.hpp index 367e8140d9..4dd6b56fd6 100644 --- a/test/network_data.hpp +++ b/test/network_data.hpp @@ -319,7 +319,15 @@ get_bn_spatial_inputs(int n = MIOPEN_TEST_DEFAULT_BATCH_SIZE_FACTOR) { pick_batch_size(32, n), 480, 128, 256 }, { pick_batch_size(32, n), 528, 64, 128 }, { pick_batch_size(770, n), 1, 8, 8 }, - { pick_batch_size(770, n), 1024, 1, 1 } + { pick_batch_size(770, n), 1024, 1, 1 }, + { pick_batch_size(152, n), 128, 80, 80 }, + { pick_batch_size(152, n), 256, 20, 20 }, + { pick_batch_size(152, n), 32, 160, 160 }, + { pick_batch_size(152, n), 512, 20, 20 }, + { pick_batch_size(152, n), 64, 160, 160 }, + { pick_batch_size(152, n), 64, 80, 80 }, + { pick_batch_size(256, n), 256, 20, 20 }, + { pick_batch_size(256, n), 512, 20, 20 } }; // clang-format on }