diff --git a/.github/workflows/auto-release.yml b/.github/workflows/auto-release.yml new file mode 100644 index 0000000000..0674fa4772 --- /dev/null +++ b/.github/workflows/auto-release.yml @@ -0,0 +1,115 @@ +name: Automatic release + +on: + push: + tags: + - 'v*' + - 'sv*' + +jobs: + create-rel: + name: BLIS release ${{ github.ref }} + runs-on: ubuntu-latest + steps: + - name: Create GH release + id: create_release + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ github.ref }} + release_name: Automatic release. + body: | + Automatically generated by GitHub actions. + draft: false + prerelease: true + outputs: + upload-url: ${{ steps.create_release.outputs.upload_url }} + + build-test-upload: + name: BLIS ${{ matrix.target.config }} - ${{ github.event_name }} + needs: create-rel + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + target: + - { + config: 'intel64', + cc: 'gcc', + cflags: '', + packages: 'gcc binutils' + } + - { + config: 'amd64', + cc: 'gcc', + cflags: '', + packages: 'gcc binutils' + } + # These Aarch64 configs would be merged after adding autodetection. + - { + config: 'cortexa57', + cc: 'aarch64-linux-gnu-gcc', + cflags: '-mno-outline-atomics', + packages: 'gcc-aarch64-linux-gnu libc6-dev-arm64-cross' + } + - { + config: 'armsve', + cc: 'aarch64-linux-gnu-gcc-10', + cflags: '-mno-outline-atomics', + packages: 'gcc-10-aarch64-linux-gnu libc6-dev-arm64-cross' + } + - { + config: 'a64fx', + cc: 'aarch64-linux-gnu-gcc-10', + cflags: '"-mno-outline-atomics -DCACHE_SECTOR_SIZE_READONLY"', + packages: 'gcc-10-aarch64-linux-gnu libc6-dev-arm64-cross' + } + + steps: + - uses: actions/checkout@v2 + + - name: Install build dependencies + run: | + sudo apt update + sudo apt install -y ${{ matrix.target.packages }} + ${{ matrix.target.cc }} --version + + - name: Configure project + run: | + mkdir dest + ./configure -t none -p ./dest \ + CC=${{ matrix.target.cc }} \ + CFLAGS=${{ matrix.target.cflags }} \ + ${{ matrix.target.config }} + + - name: Build + run: | + make -j + + - name: Quick check + if: startsWith(matrix.target.config, 'x86') + run: | + make -j checkblis-fast + + - name: Install + run: | + make install + + - name: Packup + run: | + cd dest + tar -zcvf ../libblis_${{ matrix.target.config }}_${{ matrix.target.cc }}.tar.gz \ + include lib + cd .. + + - name: Upload GH release asset + id: upload-release-asset + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ needs.create-rel.outputs.upload-url }} + asset_path: ./libblis_${{ matrix.target.config }}_${{ matrix.target.cc }}.tar.gz + asset_name: libblis_${{ matrix.target.config }}_${{ matrix.target.cc }}.tar.gz + asset_content_type: application/x-tar diff --git a/config/a64fx/bli_cntx_init_a64fx.c b/config/a64fx/bli_cntx_init_a64fx.c index 5061570f80..22ef4a944f 100644 --- a/config/a64fx/bli_cntx_init_a64fx.c +++ b/config/a64fx/bli_cntx_init_a64fx.c @@ -49,9 +49,11 @@ void bli_cntx_init_a64fx( cntx_t* cntx ) // their storage preferences. 
bli_cntx_set_l3_nat_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE, + 4, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, FALSE, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, FALSE, cntx ); @@ -67,11 +69,11 @@ void bli_cntx_init_a64fx( cntx_t* cntx ) // Initialize level-3 blocksize objects with architecture-specific values. // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 10, 10, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 256, 128, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 2048, 2048, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 23040, 26880, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, 16, 8 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 10, 10, 10, 10 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 256, 128, 192, 96 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 2048, 2048, 1536, 1536 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 23040, 26880, 11520, 11760 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. @@ -86,57 +88,6 @@ void bli_cntx_init_a64fx( cntx_t* cntx ) cntx ); -#if 0 - // Initialize sup thresholds with architecture-appropriate values. - // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 65, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 65, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 65, -1, -1 ); - - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); - - // Update the context with optimized small/unpacked gemm kernels. - bli_cntx_set_l3_sup_kers - ( - 4, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - cntx - ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 10, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 16, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); -#endif - // Set A64FX cache sector sizes for each PE/CMG // SC Fugaku might disable users' setting cache sizes. 
#if !defined(CACHE_SECTOR_SIZE_READONLY) diff --git a/config/armsve/bli_armsve_config_utils.c b/config/armsve/bli_armsve_config_utils.c index fdddeebabe..70501e39db 100644 --- a/config/armsve/bli_armsve_config_utils.c +++ b/config/armsve/bli_armsve_config_utils.c @@ -89,4 +89,6 @@ void PASTEMAC(ch, _blksz_armsve) (dim_t *m_r_, dim_t *n_r_, \ EXPANDMAC_BLKSZ_ARMSVE( s, 4 ) EXPANDMAC_BLKSZ_ARMSVE( d, 8 ) +EXPANDMAC_BLKSZ_ARMSVE( c, 8 ) +EXPANDMAC_BLKSZ_ARMSVE( z, 16 ) diff --git a/config/armsve/bli_armsve_config_utils.h b/config/armsve/bli_armsve_config_utils.h index 07aa9ba7d2..87bba73ed5 100644 --- a/config/armsve/bli_armsve_config_utils.h +++ b/config/armsve/bli_armsve_config_utils.h @@ -39,4 +39,6 @@ dim_t bli_vl_bits_armsve(void); void bli_s_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_); void bli_d_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_); +void bli_c_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_); +void bli_z_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_); diff --git a/config/armsve/bli_cntx_init_armsve.c b/config/armsve/bli_cntx_init_armsve.c index 434979f915..6d4d356d62 100644 --- a/config/armsve/bli_cntx_init_armsve.c +++ b/config/armsve/bli_cntx_init_armsve.c @@ -50,17 +50,23 @@ void bli_cntx_init_armsve( cntx_t* cntx ) // Block size. dim_t m_r_s, n_r_s, k_c_s, m_c_s, n_c_s; dim_t m_r_d, n_r_d, k_c_d, m_c_d, n_c_d; + dim_t m_r_c, n_r_c, k_c_c, m_c_c, n_c_c; + dim_t m_r_z, n_r_z, k_c_z, m_c_z, n_c_z; bli_s_blksz_armsve(&m_r_s, &n_r_s, &k_c_s, &m_c_s, &n_c_s); bli_d_blksz_armsve(&m_r_d, &n_r_d, &k_c_d, &m_c_d, &n_c_d); + bli_c_blksz_armsve(&m_r_c, &n_r_c, &k_c_c, &m_c_c, &n_c_c); + bli_z_blksz_armsve(&m_r_z, &n_r_z, &k_c_z, &m_c_z, &n_c_z); // Update the context with optimized native gemm micro-kernels and // their storage preferences. bli_cntx_set_l3_nat_ukrs ( - 2, + 4, // These are vector-length agnostic kernels. Yet knowing mr is required at runtime. - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, FALSE, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, FALSE, cntx ); @@ -84,11 +90,11 @@ void bli_cntx_init_armsve( cntx_t* cntx ) // Initialize level-3 blocksize objects with architecture-specific values. // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], m_r_s, m_r_d, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], n_r_s, n_r_d, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], m_c_s, m_c_d, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], k_c_s, k_c_d, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], n_c_s, n_c_d, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MR ], m_r_s, m_r_d, m_r_c, m_r_z ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], n_r_s, n_r_d, n_r_c, n_r_z ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], m_c_s, m_c_d, m_c_c, m_c_z ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], k_c_s, k_c_d, k_c_c, k_c_z ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], n_c_s, n_c_d, n_c_c, n_c_z ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
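For reference, the c/z register blocking added in the two hunks above follows the same vector-length-agnostic 2v×10 scheme already used for s/d: MR is two SVE vectors' worth of elements and NR is fixed at 10 by the `_2vx10_unindexed` kernels, which is why the hard-coded a64fx table (512-bit SVE) reads 32/16/16/8 for BLIS_MR across s/d/c/z. The snippet below is only an illustrative sketch of that relationship, not the body of `EXPANDMAC_BLKSZ_ARMSVE` (which also derives the cache blockings); `blksz_2vx10_sketch`, `vl_bytes`, and `elem_size` are hypothetical names introduced here.

```c
#include <stdio.h>

/* Illustrative sketch only: register blocking implied by the 2vx10 kernels.
   vl_bytes  - SVE vector length in bytes (64 on A64FX),
   elem_size - sizeof one element: 4 (s), 8 (d), 8 (c), 16 (z). */
static void blksz_2vx10_sketch( int vl_bytes, int elem_size,
                                int* m_r, int* n_r )
{
    *m_r = 2 * ( vl_bytes / elem_size ); /* two vectors per micro-panel column */
    *n_r = 10;                           /* fixed by the 2vx10 kernel family   */
}

int main( void )
{
    const char* dts[]   = { "s", "d", "c", "z" };
    const int   sizes[] = { 4, 8, 8, 16 };
    for ( int i = 0; i < 4; ++i )
    {
        int m_r, n_r;
        blksz_2vx10_sketch( 64, sizes[ i ], &m_r, &n_r );
        /* Prints m_r = 32/16/16/8 and n_r = 10, matching the BLIS_MR and
           BLIS_NR rows in the a64fx table above. */
        printf( "%s: m_r = %2d, n_r = %d\n", dts[ i ], m_r, n_r );
    }
    return 0;
}
```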
@@ -103,55 +109,5 @@ void bli_cntx_init_armsve( cntx_t* cntx ) cntx ); -#if 0 - // Initialize sup thresholds with architecture-appropriate values. - // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 101, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 101, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 101, -1, -1 ); - - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); - - // Update the context with optimized small/unpacked gemm kernels. - bli_cntx_set_l3_sup_kers - ( - 4, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - cntx - ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, n_r_d, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, m_r_d, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 2048, -1, -1 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); -#endif } diff --git a/docs/Performance.md b/docs/Performance.md index 051be7aea9..f4992d1dee 100644 --- a/docs/Performance.md +++ b/docs/Performance.md @@ -550,9 +550,9 @@ The `runthese.m` file will contain example invocations of the function. * Operating system: RHEL 8.3 * Page size: 256 bytes * Compiler: gcc 10.1.0 -* Results gathered: 2 April 2021; BLIS and SSL2 updated on 20 May 2021 +* Results gathered: 2 April 2021; BLIS and SSL2 updated on 21 Sept 2021 * Implementations tested: - * BLIS 61584de (post-0.8.1) + * BLIS b05279d (post-0.8.1) * configured with: * `../configure -t none CFLAGS="-DCACHE_SECTOR_SIZE_READONLY" a64fx` (single-threaded) * `../configure -t openmp CFLAGS="-DCACHE_SECTOR_SIZE_READONLY" a64fx` (multithreaded) @@ -574,7 +574,7 @@ The `runthese.m` file will contain example invocations of the function. * Multithreaded (12 core) execution requested via `export OMP_NUM_THREADS=12` * Multithreaded (48 core) execution requested via `export OMP_NUM_THREADS=48` * **NOTE**: While this version of ARMPL does provide multithreaded implementations of `symm`/`hemm`, `syrk`/`herk`, `trmm`, or `trsm` (with the exception `dtrsm`), but these implementations yield very low performance, and their long run times led us to skip collecting these data altogether. 
- * Fujitsu SSL2 (Fujitsu toolchain 1.2.31) + * Fujitsu SSL2 (Fujitsu toolchain 1.2.33) * Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1 NPARALLEL=1` * Multithreaded (12 core) execution requested via `export OMP_NUM_THREADS=12 NPARALLEL=12` * Multithreaded (48 core) execution requested via `export OMP_NUM_THREADS=48 NPARALLEL=48` diff --git a/docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.pdf b/docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.pdf index e273d1d098..4d27944170 100644 Binary files a/docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.pdf and b/docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.pdf differ diff --git a/docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.png b/docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.png index 1316647d65..f51548effb 100644 Binary files a/docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.png and b/docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.png differ diff --git a/docs/graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.pdf b/docs/graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.pdf index b311e0f5db..845dfaf862 100644 Binary files a/docs/graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.pdf and b/docs/graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.pdf differ diff --git a/docs/graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.png b/docs/graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.png index c2719da87a..08e46c6723 100644 Binary files a/docs/graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.png and b/docs/graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.png differ diff --git a/docs/graphs/large/l3_perf_a64fx_nt1.pdf b/docs/graphs/large/l3_perf_a64fx_nt1.pdf index 6f0b8c74fc..97a31560a1 100644 Binary files a/docs/graphs/large/l3_perf_a64fx_nt1.pdf and b/docs/graphs/large/l3_perf_a64fx_nt1.pdf differ diff --git a/docs/graphs/large/l3_perf_a64fx_nt1.png b/docs/graphs/large/l3_perf_a64fx_nt1.png index f2cb381786..0b7c2d72aa 100644 Binary files a/docs/graphs/large/l3_perf_a64fx_nt1.png and b/docs/graphs/large/l3_perf_a64fx_nt1.png differ diff --git a/kernels/armsve/3/armsve_asm_2vx10cmplx.h b/kernels/armsve/3/armsve_asm_2vx10cmplx.h new file mode 100644 index 0000000000..1b67d0d169 --- /dev/null +++ b/kernels/armsve/3/armsve_asm_2vx10cmplx.h @@ -0,0 +1,130 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ +#define GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C8Re,C9Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,C8Im,C9Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BAddr,BRSBit) \ + GEMM_FMLA2_LD1R(C0Re,C0Im,PT,AColRe,AColIm,BV0,BAddr,16) \ + GEMM_FMLA2_LD1R(C1Re,C1Im,PT,AColRe,AColIm,BV1,BAddr,18) \ + GEMM_FMLA2_LD1R(C2Re,C2Im,PT,AColRe,AColIm,BV2,BAddr,1) \ + GEMM_FMLA2_LD1R(C3Re,C3Im,PT,AColRe,AColIm,BV3,BAddr,3) \ + GEMM_FMLA2_LD1R(C4Re,C4Im,PT,AColRe,AColIm,BV4,BAddr,5) \ + GEMM_FMLA2_LD1R(C5Re,C5Im,PT,AColRe,AColIm,BV5,BAddr,7) \ + GEMM_FMLA2_LD1R(C6Re,C6Im,PT,AColRe,AColIm,BV6,BAddr,9) \ + GEMM_FMLA2_LD1R(C7Re,C7Im,PT,AColRe,AColIm,BV7,BAddr,11) \ + GEMM_FMLA2_LD1R(C8Re,C8Im,PT,AColRe,AColIm,BV0,BAddr,13) \ + GEMM_FMLA2_LD1R(C9Re,C9Im,PT,AColRe,AColIm,BV1,BAddr,15) \ + \ + GEMM_FMLX2_LD1R(C0Im,C0Re,PT,AColRe,AColIm,BV2,BAddr,17) \ + GEMM_FMLX2_LD1R(C1Im,C1Re,PT,AColRe,AColIm,BV3,BAddr,19) \ +" add "#BAddr", "#BRSBit", "#BAddr" \n\t" /* B address forward */ \ + GEMM_FMLX2_LD1R(C2Im,C2Re,PT,AColRe,AColIm,BV4,BAddr,0) \ + GEMM_FMLX2_LD1R(C3Im,C3Re,PT,AColRe,AColIm,BV5,BAddr,2) \ + GEMM_FMLX2_LD1R(C4Im,C4Re,PT,AColRe,AColIm,BV6,BAddr,4) \ + GEMM_FMLX2_LD1R(C5Im,C5Re,PT,AColRe,AColIm,BV7,BAddr,6) \ + GEMM_FMLX2_LD1R(C6Im,C6Re,PT,AColRe,AColIm,BV0,BAddr,8) \ + GEMM_FMLX2_LD1R(C7Im,C7Re,PT,AColRe,AColIm,BV1,BAddr,10) \ + GEMM_FMLX2_LD1R(C8Im,C8Re,PT,AColRe,AColIm,BV2,BAddr,12) \ + GEMM_FMLX2_LD1R(C9Im,C9Re,PT,AColRe,AColIm,BV3,BAddr,14) + +#define GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C8Re,C9Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,C8Im,C9Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BAddr,BRSBit) \ + GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C8Re,C9Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,C8Im,C9Im,PT,AColRe,AColIm,BV4,BV5,BV6,BV7,BV0,BV1,BV2,BV3,BAddr,BRSBit) + +#define GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C8Re,C9Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,C8Im,C9Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BAddr,BRSBit) \ + GEMM_FMLA2_LD1R(C0Re,C0Im,PT,AColRe,AColIm,BV0,BAddr,16) \ + GEMM_FMLA2_LD1R(C1Re,C1Im,PT,AColRe,AColIm,BV1,BAddr,18) \ + GEMM_FMLA2_LD1R(C2Re,C2Im,PT,AColRe,AColIm,BV2,BAddr,1) \ + GEMM_FMLA2_LD1R(C3Re,C3Im,PT,AColRe,AColIm,BV3,BAddr,3) \ + GEMM_FMLA2_LD1R(C4Re,C4Im,PT,AColRe,AColIm,BV4,BAddr,5) \ + GEMM_FMLA2_LD1R(C5Re,C5Im,PT,AColRe,AColIm,BV5,BAddr,7) \ + GEMM_FMLA2_LD1R(C6Re,C6Im,PT,AColRe,AColIm,BV6,BAddr,9) \ + GEMM_FMLA2_LD1R(C7Re,C7Im,PT,AColRe,AColIm,BV7,BAddr,11) \ + GEMM_FMLA2_LD1R(C8Re,C8Im,PT,AColRe,AColIm,BV0,BAddr,13) \ + GEMM_FMLA2_LD1R(C9Re,C9Im,PT,AColRe,AColIm,BV1,BAddr,15) \ + \ + GEMM_FMLX2_LD1R(C0Im,C0Re,PT,AColRe,AColIm,BV2,BAddr,17) \ + GEMM_FMLX2_LD1R(C1Im,C1Re,PT,AColRe,AColIm,BV3,BAddr,19) \ +" add "#BAddr", "#BRSBit", "#BAddr" \n\t" /* B address forward */ \ + GEMM_FMLX2(C2Im,C2Re,PT,AColRe,AColIm,BV4) \ + GEMM_FMLX2(C3Im,C3Re,PT,AColRe,AColIm,BV5) 
\ + GEMM_FMLX2(C4Im,C4Re,PT,AColRe,AColIm,BV6) \ + GEMM_FMLX2(C5Im,C5Re,PT,AColRe,AColIm,BV7) \ + GEMM_FMLX2(C6Im,C6Re,PT,AColRe,AColIm,BV0) \ + GEMM_FMLX2(C7Im,C7Re,PT,AColRe,AColIm,BV1) \ + GEMM_FMLX2(C8Im,C8Re,PT,AColRe,AColIm,BV2) \ + GEMM_FMLX2(C9Im,C9Re,PT,AColRe,AColIm,BV3) + +#define GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C8Re,C9Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,C8Im,C9Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BAddr,BRSBit) \ + GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C8Re,C9Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,C8Im,C9Im,PT,AColRe,AColIm,BV4,BV5,BV6,BV7,BV0,BV1,BV2,BV3,BAddr,BRSBit) + +#define CLEAR_COL20(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13,Z14,Z15,Z16,Z17,Z18,Z19) \ + CLEAR_COL4(Z00,Z01,Z02,Z03) \ + CLEAR_COL4(Z04,Z05,Z06,Z07) \ + CLEAR_COL4(Z08,Z09,Z10,Z11) \ + CLEAR_COL4(Z12,Z13,Z14,Z15) \ + CLEAR_COL4(Z16,Z17,Z18,Z19) + +// Moving is always .d. +// Never use .DT here! +#define MOV_COL2(ZD0Re,ZD0Im,ZD1Re,ZD1Im,Z0Re,Z0Im,Z1Re,Z1Im) \ +" mov "#ZD0Re".d, "#Z0Re".d \n\t" \ +" mov "#ZD0Im".d, "#Z0Im".d \n\t" \ +" mov "#ZD1Re".d, "#Z1Re".d \n\t" \ +" mov "#ZD1Im".d, "#Z1Im".d \n\t" + +#define GEMM_FMULCMPLX_COL2(ZD0Re,ZD0Im,ZD1Re,ZD1Im,PT,Z0Re,Z0Im,Z1Re,Z1Im,ZFactorRe,ZFactorIm) \ + FMUL_COL2(ZD0Re,ZD0Im,Z0Re,Z0Im,ZFactorRe) \ + FMUL_COL2(ZD1Re,ZD1Im,Z1Re,Z1Im,ZFactorRe) \ + GEMM_FMLX2(ZD0Im,ZD0Re,PT,Z0Re,Z0Im,ZFactorIm) \ + GEMM_FMLX2(ZD1Im,ZD1Re,PT,Z1Re,Z1Im,ZFactorIm) + +#define GEMM_FMLACMPLX_COL2(ZD0Re,ZD0Im,ZD1Re,ZD1Im,PT,Z0Re,Z0Im,Z1Re,Z1Im,ZFactorRe,ZFactorIm) \ + GEMM_FMLACMPLX(ZD0Re,ZD0Im,PT,Z0Re,Z0Im,ZFactorRe,ZFactorIm) \ + GEMM_FMLACMPLX(ZD1Re,ZD1Im,PT,Z1Re,Z1Im,ZFactorRe,ZFactorIm) + +#define GEMM_CCMPLX_LOAD_COL2_C(Z0Re,Z0Im,Z1Re,Z1Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z0Re,Z0Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z1Re,Z1Im,PT,CAddr,CCS) + +#define GEMM_CCMPLX_STORE_COL2_C(Z0Re,Z0Im,Z1Re,Z1Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z0Re,Z0Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z1Re,Z1Im,PT,CAddr,CCS) + +#define GEMM_CCMPLX_LOAD_COL2_G(Z0Re,Z0Im,Z1Re,Z1Im,PT,ZIndex,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z0Re,Z0Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z1Re,Z1Im,ZIndex,PT,PT,CAddr,CCS,CTemp) + +#define GEMM_CCMPLX_STORE_COL2_G(Z0Re,Z0Im,Z1Re,Z1Im,PT,ZIndex,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z0Re,Z0Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z1Re,Z1Im,ZIndex,PT,PT,CAddr,CCS,CTemp) + diff --git a/kernels/armsve/3/armsve_asm_2vx7cmplx.h b/kernels/armsve/3/armsve_asm_2vx7cmplx.h new file mode 100644 index 0000000000..43997deef4 --- /dev/null +++ b/kernels/armsve/3/armsve_asm_2vx7cmplx.h @@ -0,0 +1,135 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ +#define GEMM_2VX7CMPLX_MKER_LOOP_PLAIN_C(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,PT,AColRe,AColIm,B0Re,B1Re,B2Re,B3Re,B4Re,B5Re,B6Re,B0Im,B1Im,B2Im,B3Im,B4Im,B5Im,B6Im,BAddr,BRSBit) \ + GEMM_FMLA2_LD1R(C0Re,C0Im,PT,AColRe,AColIm,B0Re,BAddr,0) \ + GEMM_FMLA2_LD1R(C1Re,C1Im,PT,AColRe,AColIm,B1Re,BAddr,2) \ + GEMM_FMLA2_LD1R(C2Re,C2Im,PT,AColRe,AColIm,B2Re,BAddr,4) \ + GEMM_FMLA2_LD1R(C3Re,C3Im,PT,AColRe,AColIm,B3Re,BAddr,6) \ + GEMM_FMLA2_LD1R(C4Re,C4Im,PT,AColRe,AColIm,B4Re,BAddr,8) \ + GEMM_FMLA2_LD1R(C5Re,C5Im,PT,AColRe,AColIm,B5Re,BAddr,10) \ + GEMM_FMLA2_LD1R(C6Re,C6Im,PT,AColRe,AColIm,B6Re,BAddr,12) \ + GEMM_FMLX2_LD1R(C0Im,C0Re,PT,AColRe,AColIm,B0Im,BAddr,1) \ + GEMM_FMLX2_LD1R(C1Im,C1Re,PT,AColRe,AColIm,B1Im,BAddr,3) \ + GEMM_FMLX2_LD1R(C2Im,C2Re,PT,AColRe,AColIm,B2Im,BAddr,5) \ + GEMM_FMLX2_LD1R(C3Im,C3Re,PT,AColRe,AColIm,B3Im,BAddr,7) \ + GEMM_FMLX2_LD1R(C4Im,C4Re,PT,AColRe,AColIm,B4Im,BAddr,9) \ + GEMM_FMLX2_LD1R(C5Im,C5Re,PT,AColRe,AColIm,B5Im,BAddr,11) \ + GEMM_FMLX2_LD1R(C6Im,C6Re,PT,AColRe,AColIm,B6Im,BAddr,13) \ +" add "#BAddr", "#BRSBit", "#BAddr" \n\t" + +#define GEMM_2VX7CMPLX_MKER_LOOP_PLAIN_C_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,PT,AColRe,AColIm,B0Re,B1Re,B2Re,B3Re,B4Re,B5Re,B6Re,B0Im,B1Im,B2Im,B3Im,B4Im,B5Im,B6Im,BAddr,BRSBit) \ + GEMM_FMLA2(C0Re,C0Im,PT,AColRe,AColIm,B0Re) \ + GEMM_FMLA2(C1Re,C1Im,PT,AColRe,AColIm,B1Re) \ + GEMM_FMLA2(C2Re,C2Im,PT,AColRe,AColIm,B2Re) \ + GEMM_FMLA2(C3Re,C3Im,PT,AColRe,AColIm,B3Re) \ + GEMM_FMLA2(C4Re,C4Im,PT,AColRe,AColIm,B4Re) \ + GEMM_FMLA2(C5Re,C5Im,PT,AColRe,AColIm,B5Re) \ + GEMM_FMLA2(C6Re,C6Im,PT,AColRe,AColIm,B6Re) \ + GEMM_FMLX2(C0Im,C0Re,PT,AColRe,AColIm,B0Im) \ + GEMM_FMLX2(C1Im,C1Re,PT,AColRe,AColIm,B1Im) \ + GEMM_FMLX2(C2Im,C2Re,PT,AColRe,AColIm,B2Im) \ + GEMM_FMLX2(C3Im,C3Re,PT,AColRe,AColIm,B3Im) \ + GEMM_FMLX2(C4Im,C4Re,PT,AColRe,AColIm,B4Im) \ + GEMM_FMLX2(C5Im,C5Re,PT,AColRe,AColIm,B5Im) \ + GEMM_FMLX2(C6Im,C6Re,PT,AColRe,AColIm,B6Im) + +#define CLEAR_COL14(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13) \ + CLEAR_COL4(Z00,Z01,Z02,Z03) \ + CLEAR_COL4(Z04,Z05,Z06,Z07) \ + CLEAR_COL4(Z08,Z09,Z10,Z11) \ + CLEAR_COL2(Z12,Z13) + +#define GEMM_FMULCMPLX_COL7(ZD0Re,ZD0Im,ZD1Re,ZD1Im,ZD2Re,ZD2Im,ZD3Re,ZD3Im,ZD4Re,ZD4Im,ZD5Re,ZD5Im,ZD6Re,ZD6Im,PT,Z0Re,Z0Im,Z1Re,Z1Im,Z2Re,Z2Im,Z3Re,Z3Im,Z4Re,Z4Im,Z5Re,Z5Im,Z6Re,Z6Im,ZFactorRe,ZFactorIm) \ + FMUL_COL2(ZD0Re,ZD0Im,Z0Re,Z0Im,ZFactorRe) \ + FMUL_COL2(ZD1Re,ZD1Im,Z1Re,Z1Im,ZFactorRe) \ + FMUL_COL2(ZD2Re,ZD2Im,Z2Re,Z2Im,ZFactorRe) \ 
+ FMUL_COL2(ZD3Re,ZD3Im,Z3Re,Z3Im,ZFactorRe) \ + FMUL_COL2(ZD4Re,ZD4Im,Z4Re,Z4Im,ZFactorRe) \ + FMUL_COL2(ZD5Re,ZD5Im,Z5Re,Z5Im,ZFactorRe) \ + FMUL_COL2(ZD6Re,ZD6Im,Z6Re,Z6Im,ZFactorRe) \ + GEMM_FMLX2(ZD0Im,ZD0Re,PT,Z0Re,Z0Im,ZFactorIm) \ + GEMM_FMLX2(ZD1Im,ZD1Re,PT,Z1Re,Z1Im,ZFactorIm) \ + GEMM_FMLX2(ZD2Im,ZD2Re,PT,Z2Re,Z2Im,ZFactorIm) \ + GEMM_FMLX2(ZD3Im,ZD3Re,PT,Z3Re,Z3Im,ZFactorIm) \ + GEMM_FMLX2(ZD4Im,ZD4Re,PT,Z4Re,Z4Im,ZFactorIm) \ + GEMM_FMLX2(ZD5Im,ZD5Re,PT,Z5Re,Z5Im,ZFactorIm) \ + GEMM_FMLX2(ZD6Im,ZD6Re,PT,Z6Re,Z6Im,ZFactorIm) + +#define GEMM_FMLACMPLX_COL7(ZD0Re,ZD0Im,ZD1Re,ZD1Im,ZD2Re,ZD2Im,ZD3Re,ZD3Im,ZD4Re,ZD4Im,ZD5Re,ZD5Im,ZD6Re,ZD6Im,PT,Z0Re,Z0Im,Z1Re,Z1Im,Z2Re,Z2Im,Z3Re,Z3Im,Z4Re,Z4Im,Z5Re,Z5Im,Z6Re,Z6Im,ZFactorRe,ZFactorIm) \ + GEMM_FMLACMPLX(ZD0Re,ZD0Im,PT,Z0Re,Z0Im,ZFactorRe,ZFactorIm) \ + GEMM_FMLACMPLX(ZD1Re,ZD1Im,PT,Z1Re,Z1Im,ZFactorRe,ZFactorIm) \ + GEMM_FMLACMPLX(ZD2Re,ZD2Im,PT,Z2Re,Z2Im,ZFactorRe,ZFactorIm) \ + GEMM_FMLACMPLX(ZD3Re,ZD3Im,PT,Z3Re,Z3Im,ZFactorRe,ZFactorIm) \ + GEMM_FMLACMPLX(ZD4Re,ZD4Im,PT,Z4Re,Z4Im,ZFactorRe,ZFactorIm) \ + GEMM_FMLACMPLX(ZD5Re,ZD5Im,PT,Z5Re,Z5Im,ZFactorRe,ZFactorIm) \ + GEMM_FMLACMPLX(ZD6Re,ZD6Im,PT,Z6Re,Z6Im,ZFactorRe,ZFactorIm) + +#define GEMM_CCMPLX_LOAD_COL7_C(Z0Re,Z0Im,Z1Re,Z1Im,Z2Re,Z2Im,Z3Re,Z3Im,Z4Re,Z4Im,Z5Re,Z5Im,Z6Re,Z6Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z0Re,Z0Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z1Re,Z1Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z2Re,Z2Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z3Re,Z3Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z4Re,Z4Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z5Re,Z5Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z6Re,Z6Im,PT,CAddr,CCS) + +#define GEMM_CCMPLX_STORE_COL7_C(Z0Re,Z0Im,Z1Re,Z1Im,Z2Re,Z2Im,Z3Re,Z3Im,Z4Re,Z4Im,Z5Re,Z5Im,Z6Re,Z6Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z0Re,Z0Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z1Re,Z1Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z2Re,Z2Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z3Re,Z3Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z4Re,Z4Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z5Re,Z5Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z6Re,Z6Im,PT,CAddr,CCS) + +#define GEMM_CCMPLX_LOAD_COL7_G(Z0Re,Z0Im,Z1Re,Z1Im,Z2Re,Z2Im,Z3Re,Z3Im,Z4Re,Z4Im,Z5Re,Z5Im,Z6Re,Z6Im,PT,ZIndex,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z0Re,Z0Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z1Re,Z1Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z2Re,Z2Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z3Re,Z3Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z4Re,Z4Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z5Re,Z5Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z6Re,Z6Im,ZIndex,PT,PT,CAddr,CCS,CTemp) + +#define GEMM_CCMPLX_STORE_COL7_G(Z0Re,Z0Im,Z1Re,Z1Im,Z2Re,Z2Im,Z3Re,Z3Im,Z4Re,Z4Im,Z5Re,Z5Im,Z6Re,Z6Im,PT,ZIndex,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z0Re,Z0Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z1Re,Z1Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z2Re,Z2Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z3Re,Z3Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z4Re,Z4Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + 
GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z5Re,Z5Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z6Re,Z6Im,ZIndex,PT,PT,CAddr,CCS,CTemp) + diff --git a/kernels/armsve/3/armsve_asm_2vx8cmplx.h b/kernels/armsve/3/armsve_asm_2vx8cmplx.h new file mode 100644 index 0000000000..16711930a4 --- /dev/null +++ b/kernels/armsve/3/armsve_asm_2vx8cmplx.h @@ -0,0 +1,116 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +*/ +#define GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BV8,BV9,BV10,BV11,BAddr,BRSBit) \ + GEMM_FMLA2_LD1R(C0Re,C0Im,PT,AColRe,AColIm,BV0,BAddr,9) \ + GEMM_FMLA2_LD1R(C1Re,C1Im,PT,AColRe,AColIm,BV1,BAddr,11) \ + GEMM_FMLA2_LD1R(C2Re,C2Im,PT,AColRe,AColIm,BV2,BAddr,13) \ + GEMM_FMLA2_LD1R(C3Re,C3Im,PT,AColRe,AColIm,BV3,BAddr,15) \ +" add "#BAddr", "#BRSBit", "#BAddr" \n\t" /* B address forward */ \ + GEMM_FMLA2_LD1R(C4Re,C4Im,PT,AColRe,AColIm,BV4,BAddr,0) \ + GEMM_FMLA2_LD1R(C5Re,C5Im,PT,AColRe,AColIm,BV5,BAddr,2) \ + GEMM_FMLA2_LD1R(C6Re,C6Im,PT,AColRe,AColIm,BV6,BAddr,4) \ + GEMM_FMLA2_LD1R(C7Re,C7Im,PT,AColRe,AColIm,BV7,BAddr,6) \ + \ + GEMM_FMLX2_LD1R(C0Im,C0Re,PT,AColRe,AColIm,BV8,BAddr,8) \ + GEMM_FMLX2_LD1R(C1Im,C1Re,PT,AColRe,AColIm,BV9,BAddr,10) \ + GEMM_FMLX2_LD1R(C2Im,C2Re,PT,AColRe,AColIm,BV10,BAddr,12) \ + GEMM_FMLX2_LD1R(C3Im,C3Re,PT,AColRe,AColIm,BV11,BAddr,14) \ + GEMM_FMLX2_LD1R(C4Im,C4Re,PT,AColRe,AColIm,BV0,BAddr,1) \ + GEMM_FMLX2_LD1R(C5Im,C5Re,PT,AColRe,AColIm,BV1,BAddr,3) \ + GEMM_FMLX2_LD1R(C6Im,C6Re,PT,AColRe,AColIm,BV2,BAddr,5) \ + GEMM_FMLX2_LD1R(C7Im,C7Re,PT,AColRe,AColIm,BV3,BAddr,7) + +#define GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_2(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BV8,BV9,BV10,BV11,BAddr,BRSBit) \ + GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV4,BV5,BV6,BV7,BV8,BV9,BV10,BV11,BV0,BV1,BV2,BV3,BAddr,BRSBit) + +#define GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_3(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BV8,BV9,BV10,BV11,BAddr,BRSBit) \ + GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV8,BV9,BV10,BV11,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BAddr,BRSBit) + +#define GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BV8,BV9,BV10,BV11,BAddr,BRSBit) \ + GEMM_FMLA2_LD1R(C0Re,C0Im,PT,AColRe,AColIm,BV0,BAddr,9) \ + GEMM_FMLA2_LD1R(C1Re,C1Im,PT,AColRe,AColIm,BV1,BAddr,11) \ + GEMM_FMLA2_LD1R(C2Re,C2Im,PT,AColRe,AColIm,BV2,BAddr,13) \ + GEMM_FMLA2_LD1R(C3Re,C3Im,PT,AColRe,AColIm,BV3,BAddr,15) \ +" add "#BAddr", "#BRSBit", "#BAddr" \n\t" /* B address forward */ \ + GEMM_FMLA2(C4Re,C4Im,PT,AColRe,AColIm,BV4) \ + GEMM_FMLA2(C5Re,C5Im,PT,AColRe,AColIm,BV5) \ + GEMM_FMLA2(C6Re,C6Im,PT,AColRe,AColIm,BV6) \ + GEMM_FMLA2(C7Re,C7Im,PT,AColRe,AColIm,BV7) \ + \ + GEMM_FMLX2(C0Im,C0Re,PT,AColRe,AColIm,BV8) \ + GEMM_FMLX2(C1Im,C1Re,PT,AColRe,AColIm,BV9) \ + GEMM_FMLX2(C2Im,C2Re,PT,AColRe,AColIm,BV10) \ + GEMM_FMLX2(C3Im,C3Re,PT,AColRe,AColIm,BV11) \ + GEMM_FMLX2(C4Im,C4Re,PT,AColRe,AColIm,BV0) \ + GEMM_FMLX2(C5Im,C5Re,PT,AColRe,AColIm,BV1) \ + GEMM_FMLX2(C6Im,C6Re,PT,AColRe,AColIm,BV2) \ + GEMM_FMLX2(C7Im,C7Re,PT,AColRe,AColIm,BV3) + +#define GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_3_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BV8,BV9,BV10,BV11,BAddr,BRSBit) \ + 
GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV8,BV9,BV10,BV11,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BAddr,BRSBit) + +#define CLEAR_COL16(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13,Z14,Z15) \ + CLEAR_COL4(Z00,Z01,Z02,Z03) \ + CLEAR_COL4(Z04,Z05,Z06,Z07) \ + CLEAR_COL4(Z08,Z09,Z10,Z11) \ + CLEAR_COL4(Z12,Z13,Z14,Z15) + +#define GEMM_FMULCMPLX_COL2(ZD0Re,ZD0Im,ZD1Re,ZD1Im,PT,Z0Re,Z0Im,Z1Re,Z1Im,ZFactorRe,ZFactorIm) \ + FMUL_COL2(ZD0Re,ZD0Im,Z0Re,Z0Im,ZFactorRe) \ + FMUL_COL2(ZD1Re,ZD1Im,Z1Re,Z1Im,ZFactorRe) \ + GEMM_FMLX2(ZD0Im,ZD0Re,PT,Z0Re,Z0Im,ZFactorIm) \ + GEMM_FMLX2(ZD1Im,ZD1Re,PT,Z1Re,Z1Im,ZFactorIm) + +#define GEMM_FMLACMPLX_COL2(ZD0Re,ZD0Im,ZD1Re,ZD1Im,PT,Z0Re,Z0Im,Z1Re,Z1Im,ZFactorRe,ZFactorIm) \ + GEMM_FMLACMPLX(ZD0Re,ZD0Im,PT,Z0Re,Z0Im,ZFactorRe,ZFactorIm) \ + GEMM_FMLACMPLX(ZD1Re,ZD1Im,PT,Z1Re,Z1Im,ZFactorRe,ZFactorIm) + +#define GEMM_CCMPLX_LOAD_COL2_C(Z0Re,Z0Im,Z1Re,Z1Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z0Re,Z0Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z1Re,Z1Im,PT,CAddr,CCS) + +#define GEMM_CCMPLX_STORE_COL2_C(Z0Re,Z0Im,Z1Re,Z1Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z0Re,Z0Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z1Re,Z1Im,PT,CAddr,CCS) + +#define GEMM_CCMPLX_LOAD_COL2_G(Z0Re,Z0Im,Z1Re,Z1Im,PT,ZIndex,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z0Re,Z0Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z1Re,Z1Im,ZIndex,PT,PT,CAddr,CCS,CTemp) + +#define GEMM_CCMPLX_STORE_COL2_G(Z0Re,Z0Im,Z1Re,Z1Im,PT,ZIndex,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z0Re,Z0Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z1Re,Z1Im,ZIndex,PT,PT,CAddr,CCS,CTemp) + diff --git a/kernels/armsve/3/armsve_asm_macros_cmplx.h b/kernels/armsve/3/armsve_asm_macros_cmplx.h new file mode 100644 index 0000000000..10097700c8 --- /dev/null +++ b/kernels/armsve/3/armsve_asm_macros_cmplx.h @@ -0,0 +1,89 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ +#include "armsve_asm_macros.h" + +#define FMUL_COL2(ZD0,ZD1,Z0,Z1,ZFACTOR) \ +" fmul "#ZD0"."DT", "#Z0"."DT", "#ZFACTOR"."DT" \n\t" \ +" fmul "#ZD1"."DT", "#Z1"."DT", "#ZFACTOR"."DT" \n\t" \ + +#define GEMM_FMLX2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \ +" fmla "#CCOLFH"."DT", "#PT"/m, "#ACOLFH"."DT", "#BV"."DT" \n\t" \ +" fmls "#CCOLLH"."DT", "#PT"/m, "#ACOLLH"."DT", "#BV"."DT" \n\t" + +#define GEMM_FMLX2_LD1R(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV,BADDR,NSHIFT) \ + GEMM_FMLX2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \ +" "LD1R" "#BV"."DT", "#PT"/z, ["#BADDR", #"#NSHIFT"*"SZ"]\n\t" + +#define GEMM_FMULCMPLX(ZDRe,ZDIm,PT,Z0Re,Z0Im,Z1Re,Z1Im) \ + FMUL_COL2(ZDRe,ZDIm,Z0Re,Z0Im,Z1Re) \ + GEMM_FMLX2(ZDIm,ZDRe,PT,Z0Re,Z0Im,Z1Im) + +#define GEMM_FMLACMPLX(ZDRe,ZDIm,PT,Z0Re,Z0Im,Z1Re,Z1Im) \ + GEMM_FMLA2(ZDRe,ZDIm,PT,Z0Re,Z0Im,Z1Re) \ + GEMM_FMLX2(ZDIm,ZDRe,PT,Z0Re,Z0Im,Z1Im) + +#define GEMM_ACOLCMPLX_CONTIGUOUS_LOAD(ZRe,ZIm,PT,AAddr) \ +" "LD2" {"#ZRe"."DT", "#ZIm"."DT"}, "#PT"/z, ["#AAddr"] \n\t" + +#define GEMM_ACOLCMPLX_CONTIGUOUS_STORE(ZRe,ZIm,PT,AAddr) \ +" "ST2" {"#ZRe"."DT", "#ZIm"."DT"}, "#PT", ["#AAddr"] \n\t" + +#define GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(ZRe,ZIm,PT,AAddr,ACS) \ + GEMM_ACOLCMPLX_CONTIGUOUS_LOAD(ZRe,ZIm,PT,AAddr) \ +" add "#AAddr", "#AAddr", "#ACS" \n\t" /* Forward A address (load) to next column. */ + +#define GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(ZRe,ZIm,PT,CAddr,CCS) \ + GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(ZRe,ZIm,PT,CAddr,CCS) + +#define GEMM_ACOLCMPLX_CONTIGUOUS_STORE_FWD(ZRe,ZIm,PT,AAddr,ACS) \ + GEMM_ACOLCMPLX_CONTIGUOUS_STORE(ZRe,ZIm,PT,AAddr) \ +" add "#AAddr", "#AAddr", "#ACS" \n\t" /* Forward A address (load) to next column. */ + +#define GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(ZRe,ZIm,PT,CAddr,CCS) \ + GEMM_ACOLCMPLX_CONTIGUOUS_STORE_FWD(ZRe,ZIm,PT,CAddr,CCS) + +#define GEMM_CCOLCMPLX_GATHER_LOAD_FWD(ZRe,ZIm,ZIndex,PRe,PIm,CAddr,CCS,CTemp) \ +" add "#CTemp", "#CAddr", #"SZ" \n\t" /* Imaginary skip */ \ +" "LD1" "#ZRe"."DT", "#PRe"/z, ["#CAddr", "#ZIndex"."DT", "OFFS"]\n\t" \ +" "LD1" "#ZIm"."DT", "#PRe"/z, ["#CTemp", "#ZIndex"."DT", "OFFS"]\n\t" \ +" add "#CAddr", "#CAddr", "#CCS" \n\t" + +#define GEMM_CCOLCMPLX_SCATTER_STORE_FWD(ZRe,ZIm,ZIndex,PRe,PIm,CAddr,CCS,CTemp) \ +" add "#CTemp", "#CAddr", #"SZ" \n\t" /* Imaginary skip */ \ +" "ST1" "#ZRe"."DT", "#PRe", ["#CAddr", "#ZIndex"."DT", "OFFS"]\n\t" \ +" "ST1" "#ZIm"."DT", "#PRe", ["#CTemp", "#ZIndex"."DT", "OFFS"]\n\t" \ +" add "#CAddr", "#CAddr", "#CCS" \n\t" + diff --git a/kernels/armsve/3/armsve_asm_macros_dcomplex.h b/kernels/armsve/3/armsve_asm_macros_dcomplex.h new file mode 100644 index 0000000000..0beb5d2316 --- /dev/null +++ b/kernels/armsve/3/armsve_asm_macros_dcomplex.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ +// Specify to use double precision. +#define DT "d" +#define LD1 "ld1d" +#define ST1 "st1d" +#define LD2 "ld2d" +#define ST2 "st2d" +#define LD1R "ld1rd" +#define PRFG "prfd" +#define SZ "8" +#define OFFS "lsl #3" +// Include macros. +#include "armsve_asm_macros_cmplx.h" + diff --git a/kernels/armsve/3/armsve_asm_macros_scomplex.h b/kernels/armsve/3/armsve_asm_macros_scomplex.h new file mode 100644 index 0000000000..f49cfedfba --- /dev/null +++ b/kernels/armsve/3/armsve_asm_macros_scomplex.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ +// Specify to use single precision. +#define DT "s" +#define LD1 "ld1w" +#define ST1 "st1w" +#define LD2 "ld2w" +#define ST2 "st2w" +#define LD1R "ld1rw" +#define PRFG "prfw" +#define SZ "4" +#define OFFS "uxtw #2" +// Include macros. +#include "armsve_asm_macros_cmplx.h" + diff --git a/kernels/armsve/3/bli_gemm_armsve256_asm_d8x8.c b/kernels/armsve/3/bli_gemm_armsve256_asm_d8x8.c index 01bb644b12..7262ac0e39 100644 --- a/kernels/armsve/3/bli_gemm_armsve256_asm_d8x8.c +++ b/kernels/armsve/3/bli_gemm_armsve256_asm_d8x8.c @@ -60,6 +60,10 @@ void bli_dgemm_armsve256_asm_8x8 { void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); + static int called = 0; + if (!called) + fprintf(stderr, "8x8 called\n"); + called = 1; // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. @@ -70,27 +74,27 @@ void bli_dgemm_armsve256_asm_8x8 __asm__ volatile ( -" \n\t" -" ldr x0,%[aaddr] \n\t" // Load address of A -" ldr x1,%[baddr] \n\t" // Load address of B -" ldr x2,%[caddr] \n\t" // Load address of C -" \n\t" -" ldr x3,%[a_next] \n\t" // Move pointer -" ldr x4,%[b_next] \n\t" // Move pointer -" \n\t" -" ldr x5,%[k_iter] \n\t" // Init guard (k_iter) -" ldr x6,%[k_left] \n\t" // Init guard (k_iter) -" \n\t" -" ldr x7,%[alpha] \n\t" // Alpha address -" ldr x8,%[beta] \n\t" // Beta address -" \n\t" -" ldr x9,%[cs_c] \n\t" // Load cs_c -" lsl x10,x9,#3 \n\t" // cs_c * sizeof(double) -" \n\t" -" ldr x13,%[rs_c] \n\t" // Load rs_c. -" lsl x14,x13,#3 \n\t" // rs_c * sizeof(double). -" \n\t" -" add x20,x2,x10 \n\t" //Load address Column 1 of C +// " \n\t" +// " ldr x0,%[aaddr] \n\t" // Load address of A +// " ldr x1,%[baddr] \n\t" // Load address of B +// " ldr x2,%[caddr] \n\t" // Load address of C +// " \n\t" +// " ldr x3,%[a_next] \n\t" // Move pointer +// " ldr x4,%[b_next] \n\t" // Move pointer +// " \n\t" +// " ldr x5,%[k_iter] \n\t" // Init guard (k_iter) +// " ldr x6,%[k_left] \n\t" // Init guard (k_iter) +// " \n\t" +// " ldr x7,%[alpha] \n\t" // Alpha address +// " ldr x8,%[beta] \n\t" // Beta address +// " \n\t" +// " ldr x9,%[cs_c] \n\t" // Load cs_c +// " ldr x13,%[rs_c] \n\t" // Load rs_c. +" \n\t" +" lsl x10,%9,#3 \n\t" // cs_c * sizeof(double) +" lsl x14,%10,#3 \n\t" // rs_c * sizeof(double). +" \n\t" +" add x20,%2,x10 \n\t" //Load address Column 1 of C " add x21,x20,x10 \n\t" //Load address Column 2 of C " add x22,x21,x10 \n\t" //Load address Column 3 of C " add x23,x22,x10 \n\t" //Load address Column 4 of C @@ -98,7 +102,7 @@ __asm__ volatile " add x25,x24,x10 \n\t" //Load address Column 6 of C " add x26,x25,x10 \n\t" //Load address Column 7 of C " \n\t" -" prfm pldl1keep,[x2] \n\t" // Prefetch c. +" prfm pldl1keep,[%2] \n\t" // Prefetch c. " prfm pldl1keep,[x20] \n\t" // Prefetch c. " prfm pldl1keep,[x21] \n\t" // Prefetch c. " prfm pldl1keep,[x22] \n\t" // Prefetch c. @@ -107,39 +111,39 @@ __asm__ volatile " prfm pldl1keep,[x25] \n\t" // Prefetch c. 
" prfm pldl1keep,[x26] \n\t" // Prefetch c. " \n\t" -" ldr z0, [x0] \n\t" // Load a -" ldr z1, [x0, #1, MUL VL] \n\t" +" ldr z0, [%0] \n\t" // Load a +" ldr z1, [%0, #1, MUL VL] \n\t" " \n\t" " ptrue p0.d, all \n\t" -" ld1rqd {z2.d}, p0/z, [x1] \n\t" // load b( l,0:1 ) -" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) -" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) -" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" ld1rqd {z2.d}, p0/z, [%1] \n\t" // load b( l,0:1 ) +" ld1rqd {z3.d}, p0/z, [%1, #16] \n\t" // load b( l,2:3 ) +" ld1rqd {z4.d}, p0/z, [%1, #32] \n\t" // load b( l,4:5 ) +" ld1rqd {z5.d}, p0/z, [%1, #48] \n\t" // load b( l,6:7 ) " \n\t" -" \n\t" // PRFM, the following prefetch on [x1] and [x0] +" \n\t" // PRFM, the following prefetch on [%1] and [%0] " \n\t" // is for b rows 4..7 and a columns 4..7. " \n\t" // both of them will be used in next iteration " \n\t" // of k_iter (unrolled per 4 loops) " \n\t" " dup z16.d, #0 \n\t" // Vector for accummulating column 0 -" prfm PLDL1KEEP, [x1, #256] \n\t" // prefetch b row no.4 +" prfm PLDL1KEEP, [%1, #256] \n\t" // prefetch b row no.4 " dup z17.d, #0 \n\t" // Vector for accummulating column 0 -" prfm PLDL1KEEP, [x1, #320] \n\t" // prefetch b row no.5 +" prfm PLDL1KEEP, [%1, #320] \n\t" // prefetch b row no.5 " dup z18.d, #0 \n\t" // Vector for accummulating column 1 -" prfm PLDL1KEEP, [x1, #384] \n\t" // prefetch b row no.6 +" prfm PLDL1KEEP, [%1, #384] \n\t" // prefetch b row no.6 " dup z19.d, #0 \n\t" // Vector for accummulating column 1 -" prfm PLDL1KEEP, [x1, #448] \n\t" // preftech b row no.7 +" prfm PLDL1KEEP, [%1, #448] \n\t" // preftech b row no.7 " dup z20.d, #0 \n\t" // Vector for accummulating column 2 " dup z21.d, #0 \n\t" // Vector for accummulating column 2 " \n\t" " dup z22.d, #0 \n\t" // Vector for accummulating column 3 -" prfm PLDL1KEEP, [x0, #256] \n\t" // prefetch a col. no.4 +" prfm PLDL1KEEP, [%0, #256] \n\t" // prefetch a col. no.4 " dup z23.d, #0 \n\t" // Vector for accummulating column 3 -" prfm PLDL1KEEP, [x0, #320] \n\t" // prefetch a col. no.5 +" prfm PLDL1KEEP, [%0, #320] \n\t" // prefetch a col. no.5 " dup z24.d, #0 \n\t" // Vector for accummulating column 4 -" prfm PLDL1KEEP, [x0, #384] \n\t" // prefetch a col. no.6 +" prfm PLDL1KEEP, [%0, #384] \n\t" // prefetch a col. no.6 " dup z25.d, #0 \n\t" // Vector for accummulating column 4 -" prfm PLDL1KEEP, [x0, #448] \n\t" // prefetch a col. no.7 +" prfm PLDL1KEEP, [%0, #448] \n\t" // prefetch a col. no.7 " dup z26.d, #0 \n\t" // Vector for accummulating column 5 " dup z27.d, #0 \n\t" // Vector for accummulating column 5 " \n\t" @@ -149,157 +153,157 @@ __asm__ volatile " dup z31.d, #0 \n\t" // Vector for accummulating column 7 " \n\t" " \n\t" -" cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left. +" cmp %5,#0 \n\t" // If k_iter == 0, jump to k_left. " beq .DCONSIDERKLEFT \n\t" " \n\t" -" add x0, x0, #64 \n\t" //update address of A -" add x1, x1, #64 \n\t" //update address of B +" add %0, %0, #64 \n\t" //update address of A +" add %1, %1, #64 \n\t" //update address of B " \n\t" -" cmp x5,1 \n\t" // If there is just one k_iter, jump to that one. +" cmp %5,1 \n\t" // If there is just one k_iter, jump to that one. " beq .DLASTITER \n\t" // (as loop is do-while-like). 
" \n\t" " DLOOP: \n\t" // Body " \n\t" " fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) -" prfm PLDL1KEEP, [x1, #448] \n\t" // prefetch b row no.8, 512-64=448 +" prfm PLDL1KEEP, [%1, #448] \n\t" // prefetch b row no.8, 512-64=448 " fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) -" prfm PLDL1KEEP, [x1, #512] \n\t" // prefetch b row no.9 +" prfm PLDL1KEEP, [%1, #512] \n\t" // prefetch b row no.9 " fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) -" prfm PLDL1KEEP, [x1, #576] \n\t" // prefetch b row no.10 +" prfm PLDL1KEEP, [%1, #576] \n\t" // prefetch b row no.10 " \n\t" " fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) " fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) -" ldr z6, [x0] \n\t" // Load a( 0:3,l ) +" ldr z6, [%0] \n\t" // Load a( 0:3,l ) " \n\t" " fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) " fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) -" ldr z7, [x0, #1, MUL VL] \n\t" // load a( 4:7,l ) +" ldr z7, [%0, #1, MUL VL] \n\t" // load a( 4:7,l ) " \n\t" " fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) " fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) -" ld1rqd {z2.d}, p0/z, [x1] \n\t" // load b( l,0:1 ) +" ld1rqd {z2.d}, p0/z, [%1] \n\t" // load b( l,0:1 ) " \n\t" " fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) " fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) " fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) -" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) +" ld1rqd {z3.d}, p0/z, [%1, #16] \n\t" // load b( l,2:3 ) " \n\t" " fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) " fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) -" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) +" ld1rqd {z4.d}, p0/z, [%1, #32] \n\t" // load b( l,4:5 ) " \n\t" " fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) " fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) -" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" ld1rqd {z5.d}, p0/z, [%1, #48] \n\t" // load b( l,6:7 ) " \n\t" " \n\t" // End it 1 " \n\t" " fmla z16.d, z6.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) -" prfm PLDL1KEEP, [x1, #640] \n\t" // prefetch b row no.11 +" prfm PLDL1KEEP, [%1, #640] \n\t" // prefetch b row no.11 " fmla z17.d, z7.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) -" prfm PLDL1KEEP, [x0, #448] \n\t" // prefetch a col. no.8 +" prfm PLDL1KEEP, [%0, #448] \n\t" // prefetch a col. no.8 " fmla z18.d, z6.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) -" prfm PLDL1KEEP, [x0, #512] \n\t" // prefetch a col. no.9 +" prfm PLDL1KEEP, [%0, #512] \n\t" // prefetch a col. 
no.9 " \n\t" " fmla z19.d, z7.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) " fmla z20.d, z6.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) -" ldr z0, [x0, #2, MUL VL] \n\t" // Load a( 0:3,l ) +" ldr z0, [%0, #2, MUL VL] \n\t" // Load a( 0:3,l ) " \n\t" " fmla z21.d, z7.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) " fmla z22.d, z6.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) -" ldr z1, [x0, #3, MUL VL] \n\t" // load a( 4:7,l ) +" ldr z1, [%0, #3, MUL VL] \n\t" // load a( 4:7,l ) " \n\t" " fmla z23.d, z7.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) " fmla z24.d, z6.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) -" ld1rqd {z2.d}, p0/z, [x1, #64] \n\t" // load b( l,0:1 ) +" ld1rqd {z2.d}, p0/z, [%1, #64] \n\t" // load b( l,0:1 ) " \n\t" " fmla z25.d, z7.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) " fmla z26.d, z6.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) " fmla z27.d, z7.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) -" ld1rqd {z3.d}, p0/z, [x1, #80] \n\t" // load b( l,2:3 ) +" ld1rqd {z3.d}, p0/z, [%1, #80] \n\t" // load b( l,2:3 ) " \n\t" " fmla z28.d, z6.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) " fmla z29.d, z7.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) -" ld1rqd {z4.d}, p0/z, [x1, #96] \n\t" // load b( l,4:5 ) +" ld1rqd {z4.d}, p0/z, [%1, #96] \n\t" // load b( l,4:5 ) " \n\t" " fmla z30.d, z6.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) " fmla z31.d, z7.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) -" ld1rqd {z5.d}, p0/z, [x1, #112] \n\t" // load b( l,6:7 ) +" ld1rqd {z5.d}, p0/z, [%1, #112] \n\t" // load b( l,6:7 ) " \n\t" " \n\t" " \n\t" //End it 2 " \n\t" " fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) -" prfm PLDL1KEEP, [x0, #576] \n\t" // prefetch a col. no.10 +" prfm PLDL1KEEP, [%0, #576] \n\t" // prefetch a col. no.10 " fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) -" prfm PLDL1KEEP, [x0, #640] \n\t" // prefetch a col. no.11 +" prfm PLDL1KEEP, [%0, #640] \n\t" // prefetch a col. 
no.11 " \n\t" " fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) " \n\t" -" add x1, x1, #128 \n\t" // because immediate in 'ldr1rqd' must be +" add %1, %1, #128 \n\t" // because immediate in 'ldr1rqd' must be " \n\t" // in range -128 to 112 " \n\t" " fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) " fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) -" ldr z6, [x0, #4, MUL VL] \n\t" // Load a( 0:3,l ) +" ldr z6, [%0, #4, MUL VL] \n\t" // Load a( 0:3,l ) " \n\t" " fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) " fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) -" ldr z7, [x0, #5, MUL VL] \n\t" // load a( 4:7,l ) +" ldr z7, [%0, #5, MUL VL] \n\t" // load a( 4:7,l ) " \n\t" " fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) " fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) -" ld1rqd {z2.d}, p0/z, [x1, #0] \n\t" // load b( l,0:1 ) +" ld1rqd {z2.d}, p0/z, [%1, #0] \n\t" // load b( l,0:1 ) " \n\t" " fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) " fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) " fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) -" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) +" ld1rqd {z3.d}, p0/z, [%1, #16] \n\t" // load b( l,2:3 ) " \n\t" " fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) " fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) -" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) +" ld1rqd {z4.d}, p0/z, [%1, #32] \n\t" // load b( l,4:5 ) " \n\t" " fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) " fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) -" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" ld1rqd {z5.d}, p0/z, [%1, #48] \n\t" // load b( l,6:7 ) " \n\t" " \n\t" // End it 3 " \n\t" " fmla z16.d, z6.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) " fmla z17.d, z7.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) " fmla z18.d, z6.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) -" ldr z0, [x0, #6, MUL VL] \n\t" // Load a( 0:3,l ) +" ldr z0, [%0, #6, MUL VL] \n\t" // Load a( 0:3,l ) " \n\t" " fmla z19.d, z7.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) " fmla z20.d, z6.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) " fmla z21.d, z7.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) -" ldr z1, [x0, #7, MUL VL] \n\t" // load a( 4:7,l ) +" ldr z1, [%0, #7, MUL VL] \n\t" // load a( 4:7,l ) " \n\t" " fmla z22.d, z6.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) " fmla z23.d, z7.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) " fmla z24.d, z6.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) -" ld1rqd {z2.d}, p0/z, [x1, #64] \n\t" // load b( l,0:1 ) +" ld1rqd {z2.d}, p0/z, [%1, #64] \n\t" // load b( l,0:1 ) " \n\t" " fmla z25.d, z7.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) " fmla z26.d, z6.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) " fmla z27.d, z7.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) -" ld1rqd {z3.d}, p0/z, [x1, #80] \n\t" // load b( l,2:3 ) +" ld1rqd {z3.d}, p0/z, [%1, #80] \n\t" // load b( l,2:3 ) " \n\t" " fmla z28.d, z6.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) " fmla z29.d, z7.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) -" ld1rqd {z4.d}, 
p0/z, [x1, #96] \n\t" // load b( l,4:5 ) +" ld1rqd {z4.d}, p0/z, [%1, #96] \n\t" // load b( l,4:5 ) " \n\t" " fmla z30.d, z6.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) " fmla z31.d, z7.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) -" ld1rqd {z5.d}, p0/z, [x1, #112] \n\t" // load b( l,6:7 ) +" ld1rqd {z5.d}, p0/z, [%1, #112] \n\t" // load b( l,6:7 ) " \n\t" " \n\t" //End it 4 -" add x0, x0, #256 \n\t" -" add x1, x1, #128 \n\t" +" add %0, %0, #256 \n\t" +" add %1, %1, #128 \n\t" " \n\t" -" sub x5,x5,1 \n\t" // i-=1 -" cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1. +" sub %5,%5,1 \n\t" // i-=1 +" cmp %5,1 \n\t" // Iterate again if we are not in k_iter == 1. " bne DLOOP \n\t" " \n\t" ".DLASTITER: \n\t" @@ -307,60 +311,60 @@ __asm__ volatile " fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) " fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) " fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) -" ldr z6, [x0] \n\t" // Load a( 0:3,l ) +" ldr z6, [%0] \n\t" // Load a( 0:3,l ) " \n\t" " fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) " fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) " fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) -" ldr z7, [x0, #1, MUL VL] \n\t" // load a( 4:7,l ) +" ldr z7, [%0, #1, MUL VL] \n\t" // load a( 4:7,l ) " \n\t" " fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) " fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) " fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) -" ld1rqd {z2.d}, p0/z, [x1] \n\t" // load b( l,0:1 ) +" ld1rqd {z2.d}, p0/z, [%1] \n\t" // load b( l,0:1 ) " \n\t" " fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) " fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) " fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) -" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) +" ld1rqd {z3.d}, p0/z, [%1, #16] \n\t" // load b( l,2:3 ) " \n\t" " fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) " fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) -" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) +" ld1rqd {z4.d}, p0/z, [%1, #32] \n\t" // load b( l,4:5 ) " \n\t" " fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) " fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) -" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" ld1rqd {z5.d}, p0/z, [%1, #48] \n\t" // load b( l,6:7 ) " \n\t" " \n\t" // End it 1 " \n\t" " fmla z16.d, z6.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) " fmla z17.d, z7.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) " fmla z18.d, z6.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) -" ldr z0, [x0, #2, MUL VL] \n\t" // Load a( 0:3,l ) +" ldr z0, [%0, #2, MUL VL] \n\t" // Load a( 0:3,l ) " \n\t" " fmla z19.d, z7.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) " fmla z20.d, z6.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) " fmla z21.d, z7.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) -" ldr z1, [x0, #3, MUL VL] \n\t" // load a( 4:7,l ) +" ldr z1, [%0, #3, MUL VL] \n\t" // load a( 4:7,l ) " \n\t" " fmla z22.d, z6.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) " fmla z23.d, z7.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) " fmla 
z24.d, z6.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) -" ld1rqd {z2.d}, p0/z, [x1, #64] \n\t" // load b( l,0:1 ) +" ld1rqd {z2.d}, p0/z, [%1, #64] \n\t" // load b( l,0:1 ) " \n\t" " fmla z25.d, z7.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) " fmla z26.d, z6.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) " fmla z27.d, z7.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) -" ld1rqd {z3.d}, p0/z, [x1, #80] \n\t" // load b( l,2:3 ) +" ld1rqd {z3.d}, p0/z, [%1, #80] \n\t" // load b( l,2:3 ) " \n\t" " fmla z28.d, z6.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) " fmla z29.d, z7.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) -" ld1rqd {z4.d}, p0/z, [x1, #96] \n\t" // load b( l,4:5 ) +" ld1rqd {z4.d}, p0/z, [%1, #96] \n\t" // load b( l,4:5 ) " \n\t" " fmla z30.d, z6.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) " fmla z31.d, z7.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) -" ld1rqd {z5.d}, p0/z, [x1, #112] \n\t" // load b( l,6:7 ) +" ld1rqd {z5.d}, p0/z, [%1, #112] \n\t" // load b( l,6:7 ) " \n\t" " \n\t" " \n\t" //End it 2 @@ -368,32 +372,32 @@ __asm__ volatile " fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) " fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) " fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) -" ldr z6, [x0, #4, MUL VL] \n\t" // Load a( 0:3,l ) +" ldr z6, [%0, #4, MUL VL] \n\t" // Load a( 0:3,l ) " \n\t" " fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) " fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) " fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) -" ldr z7, [x0, #5, MUL VL] \n\t" // load a( 4:7,l ) +" ldr z7, [%0, #5, MUL VL] \n\t" // load a( 4:7,l ) " \n\t" " fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) -" add x1, x1, #128 \n\t" // because immediate in 'ldr1rqd' must be +" add %1, %1, #128 \n\t" // because immediate in 'ldr1rqd' must be " \n\t" // in range -128 to 112 " fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) " fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) -" ld1rqd {z2.d}, p0/z, [x1, #0] \n\t" // load b( l,0:1 ) +" ld1rqd {z2.d}, p0/z, [%1, #0] \n\t" // load b( l,0:1 ) " \n\t" " fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) " fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) " fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) -" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) +" ld1rqd {z3.d}, p0/z, [%1, #16] \n\t" // load b( l,2:3 ) " \n\t" " fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) " fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) -" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) +" ld1rqd {z4.d}, p0/z, [%1, #32] \n\t" // load b( l,4:5 ) " \n\t" " fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) " fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) -" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" ld1rqd {z5.d}, p0/z, [%1, #48] \n\t" // load b( l,6:7 ) " \n\t" " \n\t" // End it 3 " \n\t" @@ -414,7 +418,7 @@ __asm__ volatile " \n\t" " fmla z26.d, z6.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) " fmla z27.d, z7.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) -" add x1, x1, #64 \n\t" +" add %1, %1, #64 \n\t" " \n\t" " fmla 
z28.d, z6.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) " fmla z29.d, z7.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) @@ -423,25 +427,25 @@ __asm__ volatile " fmla z31.d, z7.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) " \n\t" " \n\t" //End it 4 -" add x0, x0, #192 \n\t" +" add %0, %0, #192 \n\t" " \n\t" " .DCONSIDERKLEFT: \n\t" -" cmp x6,0 \n\t" // If k_left == 0, we are done. +" cmp %6,0 \n\t" // If k_left == 0, we are done. " beq .DPOSTACCUM \n\t" // else, we enter the k_left loop. " \n\t" ".DLOOPKLEFT: \n\t" " \n\t" -" ldr z0, [x0] \n\t" // Load a -" ldr z1, [x0, #1, MUL VL] \n\t" -" add x0, x0, #64 \n\t" +" ldr z0, [%0] \n\t" // Load a +" ldr z1, [%0, #1, MUL VL] \n\t" +" add %0, %0, #64 \n\t" " \n\t" -" ld1rqd {z2.d}, p0/z, [x1] \n\t" // load b( l,0:1 ) -" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) -" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) -" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) -" add x1, x1, #64 \n\t" +" ld1rqd {z2.d}, p0/z, [%1] \n\t" // load b( l,0:1 ) +" ld1rqd {z3.d}, p0/z, [%1, #16] \n\t" // load b( l,2:3 ) +" ld1rqd {z4.d}, p0/z, [%1, #32] \n\t" // load b( l,4:5 ) +" ld1rqd {z5.d}, p0/z, [%1, #48] \n\t" // load b( l,6:7 ) +" add %1, %1, #64 \n\t" " \n\t" -" sub x6,x6,1 \n\t" +" sub %6,%6,1 \n\t" " \n\t" " fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) " fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) @@ -467,15 +471,15 @@ __asm__ volatile " fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) " fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) " \n\t" -" cmp x6,0 \n\t" // Iterate again. +" cmp %6,0 \n\t" // Iterate again. " bne .DLOOPKLEFT \n\t" // if i!=0. " \n\t" " .DPOSTACCUM: \n\t" " \n\t" -" ld1rd {z6.d}, p0/z, [x7] \n\t" // Load alpha. -" ld1rd {z7.d}, p0/z, [x8] \n\t" // Load beta +" ld1rd {z6.d}, p0/z, [%7] \n\t" // Load alpha. +" ld1rd {z7.d}, p0/z, [%8] \n\t" // Load beta " \n\t" -" cmp x13,#1 \n\t" // If rs_c != 1 (column-major) +" cmp %10,#1 \n\t" // If rs_c != 1 (column-major) " bne .DGENSTORED \n\t" " \n\t" " .DCOLSTORED: \n\t" // C is column-major. @@ -488,8 +492,8 @@ __asm__ volatile " fcmp d7,#0.0 \n\t" " beq .DBETAZEROCOLSTOREDS1 \n\t" // Taking care of the beta==0 case. 
" \n\t" -" ldr z0, [x2] \n\t" //Load column 0 of C -" ldr z1, [x2, #1, MUL VL] \n\t" +" ldr z0, [%2] \n\t" //Load column 0 of C +" ldr z1, [%2, #1, MUL VL] \n\t" " \n\t" " ldr z2, [x20] \n\t" //Load column 1 of C " ldr z3, [x20, #1, MUL VL] \n\t" @@ -506,8 +510,8 @@ __asm__ volatile " fmla z2.d, z18.d, z6.d[0] \n\t" // Scale by alpha " fmla z3.d, z19.d, z6.d[0] \n\t" // Scale by alpha " \n\t" -" str z0, [x2] \n\t" //Store column 0 of C -" str z1, [x2, #1, MUL VL] \n\t" +" str z0, [%2] \n\t" //Store column 0 of C +" str z1, [%2, #1, MUL VL] \n\t" " \n\t" " str z2, [x20] \n\t" //Store column 1 of C " str z3, [x20, #1, MUL VL] \n\t" @@ -597,8 +601,8 @@ __asm__ volatile " \n\t" " .DBETAZEROCOLSTOREDS4: \n\t" " \n\t" -" prfm pldl2keep,[x3] \n\t" -" prfm pldl2keep,[x4] \n\t" +" prfm pldl2keep,[%3] \n\t" +" prfm pldl2keep,[%4] \n\t" " \n\t" " fmla z8.d, z28.d, z6.d[0] \n\t" // Scale by alpha " fmla z9.d, z29.d, z6.d[0] \n\t" // Scale by alpha @@ -624,12 +628,12 @@ __asm__ volatile " \n\t" // loading/storing from column of *c " \n\t" " \n\t" // C's each column's address: -" \n\t" // x2, x20, x21, x22, x23, x24, x25, x26: are addresses of c(0,0:7) -" \n\t" // x5, x6, x7, x8, x16, x17, x18, x19: are addresses of c(4,0:7) -" add x5, x15, x2 \n\t" // x5 is address of c(4,0) -" add x6, x15, x20 \n\t" // x6 is address of c(4,1) -" add x7, x15, x21 \n\t" // x7 is address of c(4,2) -" add x8, x15, x22 \n\t" // x8 is address of c(4,3) +" \n\t" // %2, x20, x21, x22, x23, x24, x25, x26: are addresses of c(0,0:7) +" \n\t" // %5, %6, %7, %8, x16, x17, x18, x19: are addresses of c(4,0:7) +" add %5, x15, %2 \n\t" // %5 is address of c(4,0) +" add %6, x15, x20 \n\t" // %6 is address of c(4,1) +" add %7, x15, x21 \n\t" // %7 is address of c(4,2) +" add %8, x15, x22 \n\t" // %8 is address of c(4,3) " add x16, x15, x23 \n\t" // x16 is address of c(4,4) " add x17, x15, x24 \n\t" // x17 is address of c(4,5) " add x18, x15, x25 \n\t" // x18 is address of c(4,6) @@ -643,14 +647,14 @@ __asm__ volatile " fcmp d7,#0.0 \n\t" " beq .DBETAZEROGENSTOREDS1 \n\t" // Taking care of the beta==0 case. 
" \n\t" -" \n\t" // x2 is address of c(0,0) -" \n\t" // x5 is address of c(4,0) +" \n\t" // %2 is address of c(0,0) +" \n\t" // %5 is address of c(4,0) " \n\t" // x20 is address of c(0,1) -" \n\t" // x6 is address of c(4,1) -" ld1d {z0.d}, p0/z, [x2, z4.d] \n\t" // Load c( 0:3,0 ) into z0 -" ld1d {z1.d}, p0/z, [x5, z4.d] \n\t" // Load c( 4:7,0 ) into z1 +" \n\t" // %6 is address of c(4,1) +" ld1d {z0.d}, p0/z, [%2, z4.d] \n\t" // Load c( 0:3,0 ) into z0 +" ld1d {z1.d}, p0/z, [%5, z4.d] \n\t" // Load c( 4:7,0 ) into z1 " ld1d {z2.d}, p0/z, [x20, z4.d] \n\t" // Load c( 0:3,1 ) into z2 -" ld1d {z3.d}, p0/z, [x6 , z4.d] \n\t" // Load c( 4:7,1 ) into z3 +" ld1d {z3.d}, p0/z, [%6 , z4.d] \n\t" // Load c( 4:7,1 ) into z3 " \n\t" " fmul z0.d, z0.d, z7.d \n\t" // Scale by beta " fmul z1.d, z1.d, z7.d \n\t" // Scale by beta @@ -664,10 +668,10 @@ __asm__ volatile " fmla z2.d, z18.d, z6.d[0] \n\t" // Scale by alpha " fmla z3.d, z19.d, z6.d[0] \n\t" // Scale by alpha " \n\t" -" st1d {z0.d}, p0, [x2 , z4.d] \n\t" // Store c( 0:3,0 ) <- z0 -" st1d {z1.d}, p0, [x5 , z4.d] \n\t" // Store c( 4:7,0 ) <- z1 +" st1d {z0.d}, p0, [%2 , z4.d] \n\t" // Store c( 0:3,0 ) <- z0 +" st1d {z1.d}, p0, [%5 , z4.d] \n\t" // Store c( 4:7,0 ) <- z1 " st1d {z2.d}, p0, [x20, z4.d] \n\t" // Store c( 0:3,1 ) <- z2 -" st1d {z3.d}, p0, [x6 , z4.d] \n\t" // Store c( 4:7,1 ) <- z3 +" st1d {z3.d}, p0, [%6 , z4.d] \n\t" // Store c( 4:7,1 ) <- z3 " \n\t" " \n\t" " \n\t" @@ -680,13 +684,13 @@ __asm__ volatile " beq .DBETAZEROGENSTOREDS2 \n\t" // Taking care of the beta==0 case. " \n\t" " \n\t" // x21 is address of c(0,2) -" \n\t" // x7 is address of c(4,2) +" \n\t" // %7 is address of c(4,2) " \n\t" // x22 is address of c(0,3) -" \n\t" // x8 is address of c(4,3) +" \n\t" // %8 is address of c(4,3) " ld1d {z8.d}, p0/z, [x21, z4.d] \n\t" // Load c( 0:3,2 ) into z8 -" ld1d {z9.d}, p0/z, [x7 , z4.d] \n\t" // Load c( 4:7,2 ) into z9 +" ld1d {z9.d}, p0/z, [%7 , z4.d] \n\t" // Load c( 4:7,2 ) into z9 " ld1d {z10.d}, p0/z, [x22, z4.d] \n\t" // Load c( 0:3,3 ) into z10 -" ld1d {z11.d}, p0/z, [x8 , z4.d] \n\t" // Load c( 4:7,3 ) into z11 +" ld1d {z11.d}, p0/z, [%8 , z4.d] \n\t" // Load c( 4:7,3 ) into z11 " \n\t" " fmul z8.d, z8.d, z7.d \n\t" // Scale by beta " fmul z9.d, z9.d, z7.d \n\t" // Scale by beta @@ -701,9 +705,9 @@ __asm__ volatile " fmla z11.d, z23.d, z6.d[0] \n\t" // Scale by alpha " \n\t" " st1d {z8.d}, p0, [x21, z4.d] \n\t" // Store c( 0:3,2 ) <- z8 -" st1d {z9.d}, p0, [x7 , z4.d] \n\t" // Store c( 4:7,2 ) <- z9 +" st1d {z9.d}, p0, [%7 , z4.d] \n\t" // Store c( 4:7,2 ) <- z9 " st1d {z10.d}, p0, [x22, z4.d] \n\t" // Store c( 0:3,3 ) <- z10 -" st1d {z11.d}, p0, [x8 , z4.d] \n\t" // Store c( 4:7,3 ) <- z11 +" st1d {z11.d}, p0, [%8 , z4.d] \n\t" // Store c( 4:7,3 ) <- z11 " \n\t" " dup z0.d, #0 \n\t" // C column 4, 5 " dup z1.d, #0 \n\t" @@ -775,24 +779,21 @@ __asm__ volatile " \n\t" " .DEND: \n\t" // Done! 
" \n\t" -:// output operands (none) -:// input operands - [aaddr] "m" (a), // 0 - [baddr] "m" (b), // 1 - [caddr] "m" (c), // 2 - [k_iter] "m" (k_iter), // 3 - [k_left] "m" (k_left), // 4 - [alpha] "m" (alpha), // 5 - [beta] "m" (beta), // 6 - [rs_c] "m" (rs_c), // 6 - [cs_c] "m" (cs_c), // 7 - [a_next] "m" (a_next), // 8 - [b_next] "m" (b_next) // 9 +:// input/output operands + "+r" (a), // %0 + "+r" (b), // %1 + "+r" (c), // %2 + "+r" (a_next), // %3 + "+r" (b_next), // %4 + "+r" (k_iter), // %5 + "+r" (k_left), // %6 + "+r" (alpha), // %7 + "+r" (beta), // %8 + "+r" (cs_c), // %9 + "+r" (rs_c) // %10 +:// input-only operands (none) :// Register clobber list - "x0","x1","x2","x3", - "x4","x5","x6", - "x7","x8","x9", - "x10","x11","x12","x13","x14","x15","x16","x17","x18","x19", + "x10","x11","x12","x14","x15","x16","x17","x18","x19", "x20","x21","x22","x23","x24","x25","x26", "x27", "v0","v1","v2", diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c new file mode 100644 index 0000000000..4df75c7691 --- /dev/null +++ b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c @@ -0,0 +1,300 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2019, Forschunszentrum Juelich + Copyright (C) 2020, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ +#include "blis.h" + +// Single-precision composite instructions. +#include "armsve_asm_macros_scomplex.h" + +// 2vx10 microkernels. 
+#include "armsve_asm_2vx10cmplx.h" + +void bli_cgemm_armsve_asm_2vx10_unindexed + ( + dim_t k0, + scomplex* restrict alpha, + scomplex* restrict a, + scomplex* restrict b, + scomplex* restrict beta, + scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + void* a_next = bli_auxinfo_next_a( data ); + void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_mker = k0 / 4; + uint64_t k_left = k0 % 4; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + uint64_t info = 0; + + __asm__ volatile ( +// " ldr x0, %[a] \n\t" +// " ldr x1, %[b] \n\t" +" mov x2, xzr \n\t" +" incw x2, ALL, MUL #1 \n\t" // Column-skip of A. +" mov x3, #10 \n\t" // Row-skip of B. +" \n\t" +// " ldr x2, %[c] \n\t" +// " ldr x3, %[rs_c] \n\t" // Row-skip of C. +// " ldr x4, %[cs_c] \n\t" // Column-skip of C. +#ifdef _A64FX +" mov x16, 0x1 \n\t" // Tag A address. +" lsl x16, x16, #56 \n\t" +" orr %0, %0, x16 \n\t" +" mov x16, 0x2 \n\t" // Tag B address. +" lsl x16, x16, #56 \n\t" +" orr %1, %1, x16 \n\t" +" mov x16, 0x3 \n\t" // Tag C address. +" lsl x16, x16, #56 \n\t" +" orr %2, %2, x16 \n\t" +#endif +" \n\t" +" mov x16, #8 \n\t" // Multiply some address skips by sizeof(scomplex). +" madd x2, x16, x2, xzr \n\t" // cs_a +" madd x3, x16, x3, xzr \n\t" // rs_b +" madd %4, x16, %4, xzr \n\t" // cs_c +" ptrue p0.s \n\t" +" \n\t" +// " ldr x5, %[k_mker] \n\t" // Number of loops. +// " ldr x6, %[k_left] \n\t" +" \n\t" +" LOAD_ABC: \n\t" +" cmp %5, #0 \n\t" // Don't preload if no microkernel there. +" b.eq END_CCOL_PRFM \n\t" +" \n\t" +" ld1rw z20.s, p0/z, [%1, 4*0] \n\t" // Load B's real 8/10, no imaginary. +" ld1rw z21.s, p0/z, [%1, 4*2] \n\t" +" ld1rw z22.s, p0/z, [%1, 4*4] \n\t" +" ld1rw z23.s, p0/z, [%1, 4*6] \n\t" +" ld1rw z24.s, p0/z, [%1, 4*8] \n\t" +" ld1rw z25.s, p0/z, [%1, 4*10] \n\t" +" ld1rw z26.s, p0/z, [%1, 4*12] \n\t" +" ld1rw z27.s, p0/z, [%1, 4*14] \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +" \n\t" +" CCOL_PRFM: \n\t" +" cmp %3, #1 \n\t" +" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. +" mov x16, %2 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" END_CCOL_PRFM: \n\t" +" \n\t" +CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) +" \n\t" +" cmp %5, #0 \n\t" // If no 4-microkernel can be applied. 
+" b.eq K_LEFT_LOOP \n\t" +" \n\t" +" K_MKER_LOOP: \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2) +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2) +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +" subs %5, %5, #1 \n\t" // Decrease counter before final replica. +" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" b K_MKER_LOOP \n\t" +" \n\t" +" FIN_MKER_LOOP: \n\t" +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +" K_LEFT_LOOP: \n\t" +" cmp %6, #0 \n\t" // End of execution. +" b.eq WRITE_MEM_PREP \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +" ld1rw z20.s, p0/z, [%1, 4*0] \n\t" // Load B's real 8/10, no imaginary. +" ld1rw z21.s, p0/z, [%1, 4*2] \n\t" +" ld1rw z22.s, p0/z, [%1, 4*4] \n\t" +" ld1rw z23.s, p0/z, [%1, 4*6] \n\t" +" ld1rw z24.s, p0/z, [%1, 4*8] \n\t" +" ld1rw z25.s, p0/z, [%1, 4*10] \n\t" +" ld1rw z26.s, p0/z, [%1, 4*12] \n\t" +" ld1rw z27.s, p0/z, [%1, 4*14] \n\t" +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" sub %6, %6, #1 \n\t" +" b K_LEFT_LOOP \n\t" // Next column / row. +" \n\t" +" WRITE_MEM_PREP: \n\t" +" \n\t" +// " ldr x7, %[alpha] \n\t" // Load alpha & beta (address). +// " ldr x8, %[beta] \n\t" +" ld1rw z28.s, p0/z, [%7] \n\t" // Real(alpha). +" ld1rw z29.s, p0/z, [%7, 4] \n\t" // Imag(alpha). +" ld1rw z30.s, p0/z, [%8] \n\t" // Real(beta). +" ld1rw z31.s, p0/z, [%8, 4] \n\t" // Imag(beta). +" \n\t" +" PREFETCH_ABNEXT: \n\t" +// " ldr x9, %[a_next] \n\t" +// " ldr x10, %[b_next] \n\t" +#ifdef _A64FX +" mov x16, 0x1 \n\t" // Tag A address. +" lsl x16, x16, #56 \n\t" +" orr %9, %9, x16 \n\t" +" mov x16, 0x2 \n\t" // Tag B address. +" lsl x16, x16, #56 \n\t" +" orr %10, %10, x16 \n\t" +#endif +" prfm PLDL1STRM, [%9] \n\t" +" prfm PLDL1STRM, [%9, 256*1] \n\t" +" prfm PLDL1STRM, [%10] \n\t" +" prfm PLDL1STRM, [%10, 256*1] \n\t" +" \n\t" +" WRITE_MEM: \n\t" +" fmov s27, #1.0 \n\t" +" fcmp s29, #0.0 \n\t" // Whether Imag(alpha) == 0. +" fccmp s28, s27, 0, eq \n\t" // Whether Real(alpha) == 1. 
+" b.eq UNIT_ALPHA \n\t" +" \n\t" +GEMM_FMULCMPLX_COL2(z20,z21,z22,z23,p0,z0 ,z1 ,z2 ,z3 ,z28,z29) +GEMM_FMULCMPLX_COL2(z24,z25,z26,z27,p0,z4 ,z5 ,z6 ,z7 ,z28,z29) +GEMM_FMULCMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z8, z9, z10,z11,z28,z29) +GEMM_FMULCMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z12,z13,z14,z15,z28,z29) +GEMM_FMULCMPLX_COL2(z8 ,z9 ,z10,z11,p0,z16,z17,z18,z19,z28,z29) +" b WRITE_MEM_EXEC \n\t" +" \n\t" +" UNIT_ALPHA: \n\t" +MOV_COL2(z20,z21,z22,z23,z0 ,z1 ,z2 ,z3 ) +MOV_COL2(z24,z25,z26,z27,z4 ,z5 ,z6 ,z7 ) +MOV_COL2(z0 ,z1 ,z2 ,z3 ,z8, z9, z10,z11) +MOV_COL2(z4 ,z5 ,z6 ,z7 ,z12,z13,z14,z15) +MOV_COL2(z8 ,z9 ,z10,z11,z16,z17,z18,z19) +" \n\t" +" WRITE_MEM_EXEC: \n\t" +" mov x9, %2 \n\t" // C address for loading. +" \n\t" // C address for storing is %2 itself. +" cmp %3, #1 \n\t" +" b.ne WRITE_MEM_G \n\t" +" \n\t" +" WRITE_MEM_C: \n\t" +GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4) +GEMM_CCMPLX_LOAD_COL2_C(z16,z17,z18,z19,p0,x9,%4) +GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) +GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) +GEMM_CCMPLX_STORE_COL2_C(z20,z21,z22,z23,p0,%2,%4) +GEMM_CCMPLX_STORE_COL2_C(z24,z25,z26,z27,p0,%2,%4) +" \n\t" +GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4) +GEMM_CCMPLX_LOAD_COL2_C(z16,z17,z18,z19,p0,x9,%4) +GEMM_CCMPLX_LOAD_COL2_C(z20,z21,z22,z23,p0,x9,%4) +GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) +GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) +GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) +GEMM_CCMPLX_STORE_COL2_C(z0 ,z1 ,z2 ,z3 ,p0,%2,%4) +GEMM_CCMPLX_STORE_COL2_C(z4 ,z5 ,z6 ,z7 ,p0,%2,%4) +GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4) +" b END_WRITE_MEM \n\t" +" \n\t" +" WRITE_MEM_G: \n\t" +" add %3, %3, %3 \n\t" // Skips passed to index is multiplied by 2, +" mov x3, %3 \n\t" // s.t. 2*sizeof(float) = 2*4 = 8. +" index z28.s, wzr, w3 \n\t" +GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) +GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) +GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) +GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) +GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16) +GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16) +" \n\t" +GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) +GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) +GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16) +GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) +GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) +GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) +GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16) +GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16) +GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16) +" \n\t" +" END_WRITE_MEM: \n\t" +" b END_EXEC \n\t" +" \n\t" +" END_EXEC: \n\t" +" mov %11, #0 \n\t" // Return normal. 
+: "+r" (a), // %0 + "+r" (b), // %1 + "+r" (c), // %2 + "+r" (rs_c), // %3 + "+r" (cs_c), // %4 + "+r" (k_mker), // %5 + "+r" (k_left), // %6 + "+r" (alpha), // %7 + "+r" (beta), // %8 + "+r" (a_next), // %9 + "+r" (b_next), // %10 + "=r" (info) // %11 +: +: "x2","x3","x9","x16", + "z0","z1","z2","z3","z4","z5","z6","z7", + "z8","z9","z10","z11","z12","z13","z14","z15", + "z16","z17","z18","z19", + "z20","z21","z22","z23", + "z24","z25","z26","z27", + "z28","z29","z30","z31" + ); +} + diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c new file mode 100644 index 0000000000..90f212dbd1 --- /dev/null +++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c @@ -0,0 +1,299 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2019, Forschunszentrum Juelich + Copyright (C) 2020, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ +#include "blis.h" + +// Double-precision composite instructions. +#include "armsve_asm_macros_dcomplex.h" + +// 2vx10 microkernels. +#include "armsve_asm_2vx10cmplx.h" + +void bli_zgemm_armsve_asm_2vx10_unindexed + ( + dim_t k0, + dcomplex* restrict alpha, + dcomplex* restrict a, + dcomplex* restrict b, + dcomplex* restrict beta, + dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + void* a_next = bli_auxinfo_next_a( data ); + void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_mker = k0 / 4; + uint64_t k_left = k0 % 4; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + uint64_t info = 0; + + __asm__ volatile ( +// " ldr x0, %[a] \n\t" +// " ldr x1, %[b] \n\t" +" mov x2, xzr \n\t" +" incd x2, ALL, MUL #1 \n\t" // Column-skip of A. +" mov x3, #10 \n\t" // Row-skip of B. +" \n\t" +// " ldr x2, %[c] \n\t" +// " ldr x3, %[rs_c] \n\t" // Row-skip of C. 
+// " ldr x4, %[cs_c] \n\t" // Column-skip of C. +#ifdef _A64FX +" mov x16, 0x1 \n\t" // Tag A address. +" lsl x16, x16, #56 \n\t" +" orr %0, %0, x16 \n\t" +" mov x16, 0x2 \n\t" // Tag B address. +" lsl x16, x16, #56 \n\t" +" orr %1, %1, x16 \n\t" +" mov x16, 0x3 \n\t" // Tag C address. +" lsl x16, x16, #56 \n\t" +" orr %2, %2, x16 \n\t" +#endif +" \n\t" +" mov x16, #16 \n\t" // Multiply some address skips by sizeof(dcomplex). +" madd x2, x16, x2, xzr \n\t" // cs_a +" madd x3, x16, x3, xzr \n\t" // rs_b +" madd %4, x16, %4, xzr \n\t" // cs_c +" ptrue p0.d \n\t" +" \n\t" +// " ldr x5, %[k_mker] \n\t" // Number of loops. +// " ldr x6, %[k_left] \n\t" +" \n\t" +" LOAD_ABC: \n\t" +" cmp %5, #0 \n\t" // Don't preload if no microkernel there. +" b.eq END_CCOL_PRFM \n\t" +" \n\t" +" ld1rd z20.d, p0/z, [%1, 8*0] \n\t" // Load B's real 8/10, no imaginary. +" ld1rd z21.d, p0/z, [%1, 8*2] \n\t" +" ld1rd z22.d, p0/z, [%1, 8*4] \n\t" +" ld1rd z23.d, p0/z, [%1, 8*6] \n\t" +" ld1rd z24.d, p0/z, [%1, 8*8] \n\t" +" ld1rd z25.d, p0/z, [%1, 8*10] \n\t" +" ld1rd z26.d, p0/z, [%1, 8*12] \n\t" +" ld1rd z27.d, p0/z, [%1, 8*14] \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +" \n\t" +" CCOL_PRFM: \n\t" +" cmp %3, #1 \n\t" +" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. +" mov x16, %2 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" END_CCOL_PRFM: \n\t" +" \n\t" +CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) +" \n\t" +" cmp %5, #0 \n\t" // If no 4-microkernel can be applied. +" b.eq K_LEFT_LOOP \n\t" +" \n\t" +" K_MKER_LOOP: \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2) +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2) +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +" subs %5, %5, #1 \n\t" // Decrease counter before final replica. +" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" b K_MKER_LOOP \n\t" +" \n\t" +" FIN_MKER_LOOP: \n\t" +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +" K_LEFT_LOOP: \n\t" +" cmp %6, #0 \n\t" // End of execution. 
+" b.eq WRITE_MEM_PREP \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +" ld1rd z20.d, p0/z, [%1, 8*0] \n\t" // Load B's real 8/10, no imaginary. +" ld1rd z21.d, p0/z, [%1, 8*2] \n\t" +" ld1rd z22.d, p0/z, [%1, 8*4] \n\t" +" ld1rd z23.d, p0/z, [%1, 8*6] \n\t" +" ld1rd z24.d, p0/z, [%1, 8*8] \n\t" +" ld1rd z25.d, p0/z, [%1, 8*10] \n\t" +" ld1rd z26.d, p0/z, [%1, 8*12] \n\t" +" ld1rd z27.d, p0/z, [%1, 8*14] \n\t" +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" sub %6, %6, #1 \n\t" +" b K_LEFT_LOOP \n\t" // Next column / row. +" \n\t" +" WRITE_MEM_PREP: \n\t" +" \n\t" +// " ldr x7, %[alpha] \n\t" // Load alpha & beta (address). +// " ldr x8, %[beta] \n\t" +" ld1rd z28.d, p0/z, [%7] \n\t" // Real(alpha). +" ld1rd z29.d, p0/z, [%7, 8] \n\t" // Imag(alpha). +" ld1rd z30.d, p0/z, [%8] \n\t" // Real(beta). +" ld1rd z31.d, p0/z, [%8, 8] \n\t" // Imag(beta). +" \n\t" +" PREFETCH_ABNEXT: \n\t" +// " ldr x9, %[a_next] \n\t" +// " ldr x10, %[b_next] \n\t" +#ifdef _A64FX +" mov x16, 0x1 \n\t" // Tag A address. +" lsl x16, x16, #56 \n\t" +" orr %9, %9, x16 \n\t" +" mov x16, 0x2 \n\t" // Tag B address. +" lsl x16, x16, #56 \n\t" +" orr %10, %10, x16 \n\t" +#endif +" prfm PLDL1STRM, [%9] \n\t" +" prfm PLDL1STRM, [%9, 256*1] \n\t" +" prfm PLDL1STRM, [%10] \n\t" +" prfm PLDL1STRM, [%10, 256*1] \n\t" +" \n\t" +" WRITE_MEM: \n\t" +" fmov d27, #1.0 \n\t" +" fcmp d29, #0.0 \n\t" // Whether Imag(alpha) == 0. +" fccmp d28, d27, 0, eq \n\t" // Whether Real(alpha) == 1. +" b.eq UNIT_ALPHA \n\t" +" \n\t" +GEMM_FMULCMPLX_COL2(z20,z21,z22,z23,p0,z0 ,z1 ,z2 ,z3 ,z28,z29) +GEMM_FMULCMPLX_COL2(z24,z25,z26,z27,p0,z4 ,z5 ,z6 ,z7 ,z28,z29) +GEMM_FMULCMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z8, z9, z10,z11,z28,z29) +GEMM_FMULCMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z12,z13,z14,z15,z28,z29) +GEMM_FMULCMPLX_COL2(z8 ,z9 ,z10,z11,p0,z16,z17,z18,z19,z28,z29) +" b WRITE_MEM_EXEC \n\t" +" \n\t" +" UNIT_ALPHA: \n\t" +MOV_COL2(z20,z21,z22,z23,z0 ,z1 ,z2 ,z3 ) +MOV_COL2(z24,z25,z26,z27,z4 ,z5 ,z6 ,z7 ) +MOV_COL2(z0 ,z1 ,z2 ,z3 ,z8, z9, z10,z11) +MOV_COL2(z4 ,z5 ,z6 ,z7 ,z12,z13,z14,z15) +MOV_COL2(z8 ,z9 ,z10,z11,z16,z17,z18,z19) +" \n\t" +" WRITE_MEM_EXEC: \n\t" +" mov x9, %2 \n\t" // C address for loading. +" \n\t" // C address for storing is %2 itself. +" cmp %3, #1 \n\t" +" b.ne WRITE_MEM_G \n\t" +" \n\t" +" WRITE_MEM_C: \n\t" +GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4) +GEMM_CCMPLX_LOAD_COL2_C(z16,z17,z18,z19,p0,x9,%4) +GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) +GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) +GEMM_CCMPLX_STORE_COL2_C(z20,z21,z22,z23,p0,%2,%4) +GEMM_CCMPLX_STORE_COL2_C(z24,z25,z26,z27,p0,%2,%4) +" \n\t" +GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4) +GEMM_CCMPLX_LOAD_COL2_C(z16,z17,z18,z19,p0,x9,%4) +GEMM_CCMPLX_LOAD_COL2_C(z20,z21,z22,z23,p0,x9,%4) +GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) +GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) +GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) +GEMM_CCMPLX_STORE_COL2_C(z0 ,z1 ,z2 ,z3 ,p0,%2,%4) +GEMM_CCMPLX_STORE_COL2_C(z4 ,z5 ,z6 ,z7 ,p0,%2,%4) +GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4) +" b END_WRITE_MEM \n\t" +" \n\t" +" WRITE_MEM_G: \n\t" +" add %3, %3, %3 \n\t" // Skips passed to index is multiplied by 2, +" index z28.d, xzr, %3 \n\t" // s.t. 2*sizeof(double) = 2*8 = 16. 
+GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) +GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) +GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) +GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) +GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16) +GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16) +" \n\t" +GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) +GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) +GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16) +GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) +GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) +GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) +GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16) +GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16) +GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16) +" \n\t" +" END_WRITE_MEM: \n\t" +" b END_EXEC \n\t" +" \n\t" +" END_EXEC: \n\t" +" mov %11, #0 \n\t" // Return normal. +: "+r" (a), // %0 + "+r" (b), // %1 + "+r" (c), // %2 + "+r" (rs_c), // %3 + "+r" (cs_c), // %4 + "+r" (k_mker), // %5 + "+r" (k_left), // %6 + "+r" (alpha), // %7 + "+r" (beta), // %8 + "+r" (a_next), // %9 + "+r" (b_next), // %10 + "=r" (info) // %11 +: +: "x2","x3","x9","x16", + "z0","z1","z2","z3","z4","z5","z6","z7", + "z8","z9","z10","z11","z12","z13","z14","z15", + "z16","z17","z18","z19", + "z20","z21","z22","z23", + "z24","z25","z26","z27", + "z28","z29","z30","z31" + ); +} + diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c new file mode 100644 index 0000000000..3d25719d92 --- /dev/null +++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c @@ -0,0 +1,266 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2019, Forschunszentrum Juelich + Copyright (C) 2020, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +*/ +#include "blis.h" + +// Double-precision composite instructions. +#include "armsve_asm_macros_dcomplex.h" + +// 2vx7 microkernels. +#include "armsve_asm_2vx7cmplx.h" + +void bli_zgemm_armsve_asm_2vx7_unindexed + ( + dim_t k0, + dcomplex* restrict alpha, + dcomplex* restrict a, + dcomplex* restrict b, + dcomplex* restrict beta, + dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + void* a_next = bli_auxinfo_next_a( data ); + void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_mker = k0 / 4; + uint64_t k_left = k0 % 4; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + uint64_t info = 0; + + __asm__ volatile ( +// " ldr x0, %[a] \n\t" +// " ldr x1, %[b] \n\t" +" mov x2, xzr \n\t" +" incd x2, ALL, MUL #1 \n\t" // Column-skip of A. +" mov x3, #7 \n\t" // Row-skip of B. +" \n\t" +// " ldr x2, %[c] \n\t" +// " ldr x3, %[rs_c] \n\t" // Row-skip of C. +// " ldr x4, %[cs_c] \n\t" // Column-skip of C. +#ifdef _A64FX +" mov x16, 0x1 \n\t" // Tag A address. +" lsl x16, x16, #56 \n\t" +" orr %0, %0, x16 \n\t" +" mov x16, 0x2 \n\t" // Tag B address. +" lsl x16, x16, #56 \n\t" +" orr %1, %1, x16 \n\t" +" mov x16, 0x3 \n\t" // Tag C address. +" lsl x16, x16, #56 \n\t" +" orr %2, %2, x16 \n\t" +#endif +" \n\t" +" mov x16, #16 \n\t" // Multiply some address skips by sizeof(dcomplex). +" madd x2, x16, x2, xzr \n\t" // cs_a +" madd x3, x16, x3, xzr \n\t" // rs_b +" madd %4, x16, %4, xzr \n\t" // cs_c +" ptrue p0.d \n\t" +" \n\t" +// " ldr x5, %[k_mker] \n\t" // Number of loops. +// " ldr x6, %[k_left] \n\t" +" \n\t" +" LOAD_ABC: \n\t" +" cmp %5, #0 \n\t" // Don't preload if no microkernel there. +" b.eq END_CCOL_PRFM \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +" \n\t" +" ld1rd z14.d, p0/z, [%1, 8*0] \n\t" // Load B's real & imaginary. +" ld1rd z15.d, p0/z, [%1, 8*2] \n\t" +" ld1rd z16.d, p0/z, [%1, 8*4] \n\t" +" ld1rd z17.d, p0/z, [%1, 8*6] \n\t" +" ld1rd z18.d, p0/z, [%1, 8*8] \n\t" +" ld1rd z19.d, p0/z, [%1, 8*10] \n\t" +" ld1rd z20.d, p0/z, [%1, 8*12] \n\t" +" ld1rd z21.d, p0/z, [%1, 8*1] \n\t" +" ld1rd z22.d, p0/z, [%1, 8*3] \n\t" +" ld1rd z23.d, p0/z, [%1, 8*5] \n\t" +" ld1rd z24.d, p0/z, [%1, 8*7] \n\t" +" ld1rd z25.d, p0/z, [%1, 8*9] \n\t" +" ld1rd z26.d, p0/z, [%1, 8*11] \n\t" +" ld1rd z27.d, p0/z, [%1, 8*13] \n\t" +" add %1, %1, x3 \n\t" +" \n\t" +" CCOL_PRFM: \n\t" +" cmp %3, #1 \n\t" +" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. 
+" mov x16, %2 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" END_CCOL_PRFM: \n\t" +" \n\t" +CLEAR_COL14(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13) +" \n\t" +" cmp %5, #0 \n\t" // If no 4-microkernel can be applied +" b.eq K_LEFT_LOOP \n\t" +" \n\t" +" K_MKER_LOOP: \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2) +GEMM_2VX7CMPLX_MKER_LOOP_PLAIN_C(z0,z2,z4,z6,z8,z10,z12,z1,z3,z5,z7,z9,z11,z13,p0,z28,z29,z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +GEMM_2VX7CMPLX_MKER_LOOP_PLAIN_C(z0,z2,z4,z6,z8,z10,z12,z1,z3,z5,z7,z9,z11,z13,p0,z30,z31,z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2) +GEMM_2VX7CMPLX_MKER_LOOP_PLAIN_C(z0,z2,z4,z6,z8,z10,z12,z1,z3,z5,z7,z9,z11,z13,p0,z28,z29,z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +" subs %5, %5, #1 \n\t" // Decrease counter before final replica. +" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +GEMM_2VX7CMPLX_MKER_LOOP_PLAIN_C(z0,z2,z4,z6,z8,z10,z12,z1,z3,z5,z7,z9,z11,z13,p0,z30,z31,z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" b K_MKER_LOOP \n\t" +" \n\t" +" FIN_MKER_LOOP: \n\t" +GEMM_2VX7CMPLX_MKER_LOOP_PLAIN_C_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z1,z3,z5,z7,z9,z11,z13,p0,z30,z31,z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +" K_LEFT_LOOP: \n\t" +" cmp %6, #0 \n\t" // End of execution. +" b.eq WRITE_MEM_PREP \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +" ld1rd z14.d, p0/z, [%1, 8*0] \n\t" +" ld1rd z15.d, p0/z, [%1, 8*2] \n\t" +" ld1rd z16.d, p0/z, [%1, 8*4] \n\t" +" ld1rd z17.d, p0/z, [%1, 8*6] \n\t" +" ld1rd z18.d, p0/z, [%1, 8*8] \n\t" +" ld1rd z19.d, p0/z, [%1, 8*10] \n\t" +" ld1rd z20.d, p0/z, [%1, 8*12] \n\t" +" ld1rd z21.d, p0/z, [%1, 8*1] \n\t" +" ld1rd z22.d, p0/z, [%1, 8*3] \n\t" +" ld1rd z23.d, p0/z, [%1, 8*5] \n\t" +" ld1rd z24.d, p0/z, [%1, 8*7] \n\t" +" ld1rd z25.d, p0/z, [%1, 8*9] \n\t" +" ld1rd z26.d, p0/z, [%1, 8*11] \n\t" +" ld1rd z27.d, p0/z, [%1, 8*13] \n\t" +" add %1, %1, x3 \n\t" +GEMM_2VX7CMPLX_MKER_LOOP_PLAIN_C_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z1,z3,z5,z7,z9,z11,z13,p0,z28,z29,z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" sub %6, %6, #1 \n\t" +" b K_LEFT_LOOP \n\t" // Next column / row. +" \n\t" +" WRITE_MEM_PREP: \n\t" +" \n\t" +// " ldr x7, %[alpha] \n\t" // Load alpha & beta (address). +// " ldr x8, %[beta] \n\t" +" ld1rd z28.d, p0/z, [%7] \n\t" // Real(alpha). +" ld1rd z29.d, p0/z, [%7, 8] \n\t" // Imag(alpha). +" ld1rd z30.d, p0/z, [%8] \n\t" // Real(beta). +" ld1rd z31.d, p0/z, [%8, 8] \n\t" // Imag(beta). +" \n\t" +" PREFETCH_ABNEXT: \n\t" +// " ldr x9, %[a_next] \n\t" +// " ldr x10, %[b_next] \n\t" +#ifdef _A64FX +" mov x16, 0x1 \n\t" // Tag A address. +" lsl x16, x16, #56 \n\t" +" orr %9, %9, x16 \n\t" +" mov x16, 0x2 \n\t" // Tag B address. 
+" lsl x16, x16, #56 \n\t" +" orr %10, %10, x16 \n\t" +#endif +" prfm PLDL1STRM, [%9] \n\t" +" prfm PLDL1STRM, [%9, 256*1] \n\t" +" prfm PLDL1STRM, [%10] \n\t" +" prfm PLDL1STRM, [%10, 256*1] \n\t" +" \n\t" +" WRITE_MEM: \n\t" +" \n\t" +GEMM_FMULCMPLX_COL7(z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,p0,z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z28,z29) +" \n\t" +" UNIT_ALPHA: \n\t" +" mov x9, %2 \n\t" // C address for loading. +" \n\t" // C address for storing is %2 itself. +" cmp %3, #1 \n\t" +" b.ne WRITE_MEM_G \n\t" +" \n\t" +" WRITE_MEM_C: \n\t" +GEMM_CCMPLX_LOAD_COL7_C(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,p0,x9,%4) +GEMM_FMLACMPLX_COL7(z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,p0,z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z30,z31) +GEMM_CCMPLX_STORE_COL7_C(z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,p0,%2,%4) +" b END_WRITE_MEM \n\t" +" \n\t" +" WRITE_MEM_G: \n\t" +" add %3, %3, %3 \n\t" // Skips passed to index is multiplied by 2, +" index z28.d, xzr, %3 \n\t" // s.t. 2*sizeof(double) = 2*8 = 16. +GEMM_CCMPLX_LOAD_COL7_G(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,p0,z28,x9,%4,x16) +GEMM_FMLACMPLX_COL7(z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,p0,z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z30,z31) +GEMM_CCMPLX_STORE_COL7_G(z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,p0,z28,%2,%4,x16) +" \n\t" +" END_WRITE_MEM: \n\t" +" b END_EXEC \n\t" +" \n\t" +" END_EXEC: \n\t" +" mov %11, #0 \n\t" // Return normal. +: "+r" (a), // %0 + "+r" (b), // %1 + "+r" (c), // %2 + "+r" (rs_c), // %3 + "+r" (cs_c), // %4 + "+r" (k_mker), // %5 + "+r" (k_left), // %6 + "+r" (alpha), // %7 + "+r" (beta), // %8 + "+r" (a_next), // %9 + "+r" (b_next), // %10 + "=r" (info) // %11 +: +: "x2","x3","x9","x16", + "z0","z1","z2","z3","z4","z5","z6","z7", + "z8","z9","z10","z11","z12","z13","z14","z15", + "z16","z17","z18","z19", + "z20","z21","z22","z23", + "z24","z25","z26","z27", + "z28","z29","z30","z31" + ); +} + + diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c new file mode 100644 index 0000000000..d0eef4a8ca --- /dev/null +++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c @@ -0,0 +1,290 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2019, Forschunszentrum Juelich + Copyright (C) 2020, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ +#include "blis.h" + +// Double-precision composite instructions. +#include "armsve_asm_macros_dcomplex.h" + +// 2vx8 microkernels. +#include "armsve_asm_2vx8cmplx.h" + +void bli_zgemm_armsve_asm_2vx8_unindexed + ( + dim_t k0, + dcomplex* restrict alpha, + dcomplex* restrict a, + dcomplex* restrict b, + dcomplex* restrict beta, + dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + void* a_next = bli_auxinfo_next_a( data ); + void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_mker = k0 / 6; + uint64_t k_left = k0 % 6; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + uint64_t info = 0; + + __asm__ volatile ( +// " ldr x0, %[a] \n\t" +// " ldr x1, %[b] \n\t" +" mov x2, xzr \n\t" +" incd x2, ALL, MUL #1 \n\t" // Column-skip of A. +" mov x3, #8 \n\t" // Row-skip of B. +" \n\t" +// " ldr x2, %[c] \n\t" +// " ldr x3, %[rs_c] \n\t" // Row-skip of C. +// " ldr x4, %[cs_c] \n\t" // Column-skip of C. +#ifdef _A64FX +" mov x16, 0x1 \n\t" // Tag A address. +" lsl x16, x16, #56 \n\t" +" orr %0, %0, x16 \n\t" +" mov x16, 0x2 \n\t" // Tag B address. +" lsl x16, x16, #56 \n\t" +" orr %1, %1, x16 \n\t" +" mov x16, 0x3 \n\t" // Tag C address. +" lsl x16, x16, #56 \n\t" +" orr %2, %2, x16 \n\t" +#endif +" \n\t" +" mov x16, #16 \n\t" // Multiply some address skips by sizeof(dcomplex). +" madd x2, x16, x2, xzr \n\t" // cs_a +" madd x3, x16, x3, xzr \n\t" // rs_b +" madd %4, x16, %4, xzr \n\t" // cs_c +" ptrue p0.d \n\t" +" \n\t" +// " ldr x5, %[k_mker] \n\t" // Number of loops. +// " ldr x6, %[k_left] \n\t" +" \n\t" +" LOAD_ABC: \n\t" +" cmp %5, #0 \n\t" // Don't preload if no microkernel there. +" b.eq END_CCOL_PRFM \n\t" +" \n\t" +" ld1rd z20.d, p0/z, [%1, 8*0] \n\t" // Load B's real & half of imaginary. +" ld1rd z21.d, p0/z, [%1, 8*2] \n\t" +" ld1rd z22.d, p0/z, [%1, 8*4] \n\t" +" ld1rd z23.d, p0/z, [%1, 8*6] \n\t" +" ld1rd z24.d, p0/z, [%1, 8*8] \n\t" +" ld1rd z25.d, p0/z, [%1, 8*10] \n\t" +" ld1rd z26.d, p0/z, [%1, 8*12] \n\t" +" ld1rd z27.d, p0/z, [%1, 8*14] \n\t" +" ld1rd z28.d, p0/z, [%1, 8*1] \n\t" +" ld1rd z29.d, p0/z, [%1, 8*3] \n\t" +" ld1rd z30.d, p0/z, [%1, 8*5] \n\t" +" ld1rd z31.d, p0/z, [%1, 8*7] \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z16,z17,p0,%0,x2) +" \n\t" +" CCOL_PRFM: \n\t" +" cmp %3, #1 \n\t" +" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. 
+" mov x16, %2 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" END_CCOL_PRFM: \n\t" +" \n\t" +CLEAR_COL16(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15) +" \n\t" +" cmp %5, #0 \n\t" // If no 6-microkernel can be applied +" b.eq K_LEFT_LOOP \n\t" +" \n\t" +" K_MKER_LOOP: \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z18,z19,p0,%0,x2) +GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z16,z17,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3) +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z16,z17,p0,%0,x2) +GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3) +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z18,z19,p0,%0,x2) +GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z16,z17,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3) +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z16,z17,p0,%0,x2) +GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3) +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z18,z19,p0,%0,x2) +GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z16,z17,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3) +" \n\t" +" subs %5, %5, #1 \n\t" // Decrease counter before final replica. +" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z16,z17,p0,%0,x2) +GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3) +" b K_MKER_LOOP \n\t" +" \n\t" +" FIN_MKER_LOOP: \n\t" +GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_3_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3) +" \n\t" +" K_LEFT_LOOP: \n\t" +" cmp %6, #0 \n\t" // End of execution. +" b.eq WRITE_MEM_PREP \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z16,z17,p0,%0,x2) +" ld1rd z20.d, p0/z, [%1, 8*0] \n\t" // Reload B's real & half of imaginary. +" ld1rd z21.d, p0/z, [%1, 8*2] \n\t" +" ld1rd z22.d, p0/z, [%1, 8*4] \n\t" +" ld1rd z23.d, p0/z, [%1, 8*6] \n\t" +" ld1rd z24.d, p0/z, [%1, 8*8] \n\t" +" ld1rd z25.d, p0/z, [%1, 8*10] \n\t" +" ld1rd z26.d, p0/z, [%1, 8*12] \n\t" +" ld1rd z27.d, p0/z, [%1, 8*14] \n\t" +" ld1rd z28.d, p0/z, [%1, 8*1] \n\t" +" ld1rd z29.d, p0/z, [%1, 8*3] \n\t" +" ld1rd z30.d, p0/z, [%1, 8*5] \n\t" +" ld1rd z31.d, p0/z, [%1, 8*7] \n\t" +GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z16,z17,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3) +" sub %6, %6, #1 \n\t" +" b K_LEFT_LOOP \n\t" // Next column / row. +" \n\t" +" WRITE_MEM_PREP: \n\t" +" \n\t" +// " ldr x7, %[alpha] \n\t" // Load alpha & beta (address). +// " ldr x8, %[beta] \n\t" +" ld1rd z16.d, p0/z, [%7] \n\t" // Real(alpha). +" ld1rd z17.d, p0/z, [%7, 8] \n\t" // Imag(alpha). +" ld1rd z18.d, p0/z, [%8] \n\t" // Real(beta). 
+" ld1rd z19.d, p0/z, [%8, 8] \n\t" // Imag(beta). +" \n\t" +" PREFETCH_ABNEXT: \n\t" +// " ldr x9, %[a_next] \n\t" +// " ldr x10, %[b_next] \n\t" +#ifdef _A64FX +" mov x16, 0x1 \n\t" // Tag A address. +" lsl x16, x16, #56 \n\t" +" orr %9, %9, x16 \n\t" +" mov x16, 0x2 \n\t" // Tag B address. +" lsl x16, x16, #56 \n\t" +" orr %10, %10, x16 \n\t" +#endif +" prfm PLDL1STRM, [%9] \n\t" +" prfm PLDL1STRM, [%9, 256*1] \n\t" +" prfm PLDL1STRM, [%10] \n\t" +" prfm PLDL1STRM, [%10, 256*1] \n\t" +" \n\t" +" WRITE_MEM: \n\t" +" \n\t" +GEMM_FMULCMPLX_COL2(z20,z21,z22,z23,p0,z0 ,z1 ,z2 ,z3 ,z16,z17) +GEMM_FMULCMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z4 ,z5 ,z6 ,z7 ,z16,z17) +GEMM_FMULCMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z8 ,z9 ,z10,z11,z16,z17) +GEMM_FMULCMPLX_COL2(z8 ,z9 ,z10,z11,p0,z12,z13,z14,z15,z16,z17) +" \n\t" +" UNIT_ALPHA: \n\t" +" mov x9, %2 \n\t" // C address for loading. +" \n\t" // C address for storing is %2 itself. +" cmp %3, #1 \n\t" +" b.ne WRITE_MEM_G \n\t" +" \n\t" +" WRITE_MEM_C: \n\t" +GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4) +GEMM_CCMPLX_LOAD_COL2_C(z24,z25,z26,z27,p0,x9,%4) +GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z18,z19) +GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z24,z25,z26,z27,z18,z19) +GEMM_CCMPLX_STORE_COL2_C(z20,z21,z22,z23,p0,%2,%4) +GEMM_CCMPLX_STORE_COL2_C(z0 ,z1 ,z2 ,z3 ,p0,%2,%4) +" \n\t" +GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4) +GEMM_CCMPLX_LOAD_COL2_C(z24,z25,z26,z27,p0,x9,%4) +GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z12,z13,z14,z15,z18,z19) +GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z24,z25,z26,z27,z18,z19) +GEMM_CCMPLX_STORE_COL2_C(z4 ,z5 ,z6 ,z7 ,p0,%2,%4) +GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4) +" b END_WRITE_MEM \n\t" +" \n\t" +" WRITE_MEM_G: \n\t" +" add %3, %3, %3 \n\t" // Skips passed to index is multiplied by 2, +" index z16.d, xzr, %3 \n\t" // s.t. 2*sizeof(double) = 2*8 = 16. +GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z16,x9,%4,x16) +GEMM_CCMPLX_LOAD_COL2_G(z24,z25,z26,z27,p0,z16,x9,%4,x16) +GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z18,z19) +GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z24,z25,z26,z27,z18,z19) +GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z16,%2,%4,x16) +GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z16,%2,%4,x16) +" \n\t" +GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z16,x9,%4,x16) +GEMM_CCMPLX_LOAD_COL2_G(z24,z25,z26,z27,p0,z16,x9,%4,x16) +GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z12,z13,z14,z15,z18,z19) +GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z24,z25,z26,z27,z18,z19) +GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z16,%2,%4,x16) +GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z16,%2,%4,x16) +" \n\t" +" END_WRITE_MEM: \n\t" +" b END_EXEC \n\t" +" \n\t" +" END_EXEC: \n\t" +" mov %11, #0 \n\t" // Return normal. 
+: "+r" (a), // %0 + "+r" (b), // %1 + "+r" (c), // %2 + "+r" (rs_c), // %3 + "+r" (cs_c), // %4 + "+r" (k_mker), // %5 + "+r" (k_left), // %6 + "+r" (alpha), // %7 + "+r" (beta), // %8 + "+r" (a_next), // %9 + "+r" (b_next), // %10 + "=r" (info) // %11 +: +: "x2","x3","x9","x16", + "z0","z1","z2","z3","z4","z5","z6","z7", + "z8","z9","z10","z11","z12","z13","z14","z15", + "z16","z17","z18","z19", + "z20","z21","z22","z23", + "z24","z25","z26","z27", + "z28","z29","z30","z31" + ); +} + diff --git a/kernels/armsve/3/sup/bli_gemmsup_armsve_ref.c b/kernels/armsve/3/sup/bli_gemmsup_armsve_ref.c deleted file mode 100644 index ff3a35e7a6..0000000000 --- a/kernels/armsve/3/sup/bli_gemmsup_armsve_ref.c +++ /dev/null @@ -1,450 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -// Separate instantiation for ArmSVE reference kernels. -// Temporary workaround. Will be removed after upstream has switched to a better way -// of exposing gemmsup interface. - -// -// -- Row storage case --------------------------------------------------------- -// - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, inc_t rs_a, inc_t cs_a, \ - ctype* restrict b, inc_t rs_b, inc_t cs_b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - /* NOTE: This microkernel can actually handle arbitrarily large - values of m, n, and k. */ \ -\ - if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \ - { \ - /* Traverse c by rows. 
*/ \ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict ci = &c[ i*rs_c ]; \ - ctype* restrict ai = &a[ i*rs_a ]; \ -\ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cij = &ci[ j*cs_c ]; \ - ctype* restrict bj = &b [ j*cs_b ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,dots)( *aij, *bij, ab ); \ - } \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ - else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \ - { \ - /* Traverse c by rows. */ \ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict ci = &c[ i*rs_c ]; \ - ctype* restrict ai = &a[ i*rs_a ]; \ -\ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cij = &ci[ j*cs_c ]; \ - ctype* restrict bj = &b [ j*cs_b ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \ - } \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ - else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \ - { \ - /* Traverse c by rows. */ \ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict ci = &c[ i*rs_c ]; \ - ctype* restrict ai = &a[ i*rs_a ]; \ -\ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cij = &ci[ j*cs_c ]; \ - ctype* restrict bj = &b [ j*cs_b ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \ - } \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \ - { \ - /* Traverse c by rows. 
*/ \ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict ci = &c[ i*rs_c ]; \ - ctype* restrict ai = &a[ i*rs_a ]; \ -\ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cij = &ci[ j*cs_c ]; \ - ctype* restrict bj = &b [ j*cs_b ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,dots)( *aij, *bij, ab ); \ - } \ -\ - /* Conjugate the result to simulate conj(a^T) * conj(b). */ \ - PASTEMAC(ch,conjs)( ab ); \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( gemmsup_r, _armsve, _ref2 ) - -// -// -- Column storage case ------------------------------------------------------ -// - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, inc_t rs_a, inc_t cs_a, \ - ctype* restrict b, inc_t rs_b, inc_t cs_b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - /* NOTE: This microkernel can actually handle arbitrarily large - values of m, n, and k. */ \ -\ - if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \ - { \ - /* Traverse c by columns. */ \ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cj = &c[ j*cs_c ]; \ - ctype* restrict bj = &b[ j*cs_b ]; \ -\ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict cij = &cj[ i*rs_c ]; \ - ctype* restrict ai = &a [ i*rs_a ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,dots)( *aij, *bij, ab ); \ - } \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ - else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \ - { \ - /* Traverse c by columns. */ \ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cj = &c[ j*cs_c ]; \ - ctype* restrict bj = &b[ j*cs_b ]; \ -\ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict cij = &cj[ i*rs_c ]; \ - ctype* restrict ai = &a [ i*rs_a ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \ - } \ -\ - /* If beta is one, add ab into c. 
If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ - else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \ - { \ - /* Traverse c by columns. */ \ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cj = &c[ j*cs_c ]; \ - ctype* restrict bj = &b[ j*cs_b ]; \ -\ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict cij = &cj[ i*rs_c ]; \ - ctype* restrict ai = &a [ i*rs_a ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \ - } \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \ - { \ - /* Traverse c by columns. */ \ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cj = &c[ j*cs_c ]; \ - ctype* restrict bj = &b[ j*cs_b ]; \ -\ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict cij = &cj[ i*rs_c ]; \ - ctype* restrict ai = &a [ i*rs_a ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,dots)( *aij, *bij, ab ); \ - } \ -\ - /* Conjugate the result to simulate conj(a^T) * conj(b). */ \ - PASTEMAC(ch,conjs)( ab ); \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( gemmsup_c, _armsve, _ref2 ) - diff --git a/kernels/armsve/3/sup/bli_gemmsup_cv_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/sup/bli_gemmsup_cv_armsve_asm_d2vx10_unindexed.c deleted file mode 100644 index 3341b63d00..0000000000 --- a/kernels/armsve/3/sup/bli_gemmsup_cv_armsve_asm_d2vx10_unindexed.c +++ /dev/null @@ -1,528 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, The University of Tokyo - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -*/ -#include "blis.h" -#include - -// Double-precision composite instructions. -#include "../armsve_asm_macros_double.h" - -// 2vx10 microkernels. -#include "../armsve_asm_2vx10.h" - -// Prototype reference kernel. -GEMMSUP_KER_PROT( double, d, gemmsup_c_armsve_ref2 ) - -void __attribute__ ((noinline,optimize(0))) bli_dgemmsup_cv_armsve_2vx10_unindexed - ( - conj_t conja, - conj_t conjb, - dim_t m0, - dim_t n0, - dim_t k0, - double* restrict alpha, - double* restrict a, inc_t rs_a0, inc_t cs_a0, - double* restrict b, inc_t rs_b0, inc_t cs_b0, - double* restrict beta, - double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx - ) -{ - static int called = 0; - if ( !called ) - { - fprintf(stderr, "rv called.\n"); - called = 1; - } - // c*c requires A to be stored in columns. - assert( rs_a0 == 1 ); - - dim_t n0_mker = n0 / 10; - dim_t n0_left = n0 % 10; - - if ( n0_left ) - { - // A[:, ::] - // B[::, n0_mker*10:n0] - // C[: , n0_mker*10:n0] - double *ai = a; - double *bi = b + n0_mker * 10 * cs_b0; - double *ci = c + n0_mker * 10 * cs_c0; - bli_dgemmsup_c_armsve_ref2 - ( - conja, conjb, - m0, n0_left, k0, - alpha, - ai, rs_a0, cs_a0, - bi, rs_b0, cs_b0, - beta, - ci, rs_c0, cs_c0, - data, - cntx - ); - } - // Return if it's a pure edge case. - if ( !n0_mker ) - return; - - // Determine VL. - uint64_t vlen2; - __asm__ ( - " mov x0, xzr \n\t" - " incd x0, ALL, MUL #2 \n\t" - " mov %[vlen2], x0 \n\t" - : [vlen2] "=r" (vlen2) - : - : "x0" - ); - - uint64_t rs_c = rs_c0; - uint64_t cs_c = cs_c0; - // uint64_t rs_a = 1; - uint64_t cs_a = cs_a0; - uint64_t rs_b = rs_b0; - uint64_t cs_b = cs_b0; - - uint64_t k_mker = k0 / 4; - uint64_t k_left = k0 % 4; - uint64_t n_mker = n0_mker; - - dim_t m0_mker = m0 / vlen2; - dim_t m0_left = m0 % vlen2; - if ( m0_left ) - { - // Edge case on A side can be handled with one more (predicated) loop. - m0_mker++; - } else - m0_left = vlen2; - // uint64_t ps_a = bli_auxinfo_ps_a( data ); - uint64_t ps_b = bli_auxinfo_ps_b( data ); - - for ( dim_t im0_mker = 0; im0_mker < m0_mker; ++im0_mker ) - { - uint64_t m_curr = vlen2; - if ( im0_mker == m0_mker - 1 ) - { - // Last m-loop. Maybe unnecessary. 
- m_curr = m0_left; - } - double *ai = a + im0_mker * vlen2 * rs_a0; - double *bi = b; - double *ci = c + im0_mker * vlen2 * rs_c0; - - void* a_next = bli_auxinfo_next_a( data ); - void* b_next = bli_auxinfo_next_b( data ); - - __asm__ volatile ( -" ldr x0, %[bi] \n\t" -" ldr x1, %[rs_b] \n\t" // Row-skip of B. -" ldr x2, %[cs_b] \n\t" // Column-skip of B (element skip of B[l, :]). -" ldr x3, %[ps_b] \n\t" // Panel-skip (10*k) of B. -" ldr x4, %[cs_a] \n\t" // Column-Skip of A. -" \n\t" // Element skip of A[:, l] is guaranteed to be 1. -" ldr x5, %[ci] \n\t" -" ldr x6, %[rs_c] \n\t" // Row-skip of C. -" ldr x7, %[cs_c] \n\t" // Column-skip of C. -#ifdef _A64FX -" mov x16, 0x1 \n\t" // Tag C address. -" lsl x16, x16, #56 \n\t" -" orr x5, x5, x16 \n\t" -" mov x16, 0x2 \n\t" // Tag B address. -" lsl x16, x16, #56 \n\t" -" orr x0, x0, x16 \n\t" -#endif -" \n\t" -" mov x8, #8 \n\t" // Multiply some address skips by sizeof(double). -" madd x1, x8, x1, xzr \n\t" // rs_b -" madd x2, x8, x2, xzr \n\t" // cs_b -" madd x3, x8, x3, xzr \n\t" // ps_b -" madd x4, x8, x4, xzr \n\t" // cs_a -" madd x7, x8, x7, xzr \n\t" // cs_c -" mov x8, #4 \n\t" -" madd x15, x8, x4, xzr \n\t" // Logical K=4 microkernel skip for A. -" \n\t" -#ifdef _A64FX -" mov x16, 0x20 \n\t" // Higher 6bit for Control#2: -" lsl x16, x16, #58 \n\t" // Valid|Strong|Strong|NoAlloc|Load|Strong -" orr x16, x16, x4 \n\t" // Stride. -" msr S3_3_C11_C6_2, x16 \n\t" // Write system register. -#endif -" \n\t" -" ldr x8, %[m_curr] \n\t" // Size of first dimension. -" mov x9, xzr \n\t" -" incd x9 \n\t" -" ptrue p0.d \n\t" -" whilelo p1.d, xzr, x8 \n\t" -" whilelo p2.d, x9, x8 \n\t" -" \n\t" -" ldr x8, %[n_mker] \n\t" // Number of N-loops. -" \n\t" -" ldr x20, %[ai] \n\t" // Parameters to be reloaded -" ldr x21, %[k_mker] \n\t" // within each millikernel loop. -" ldr x22, %[k_left] \n\t" -" ldr x23, %[alpha] \n\t" -" ldr x24, %[beta] \n\t" -" ldr x25, %[a_next] \n\t" -" ldr x26, %[b_next] \n\t" -" ldr x23, [x23] \n\t" // Directly load alpha and beta. -" ldr x24, [x24] \n\t" -" \n\t" -" MILLIKER_MLOOP: \n\t" -" \n\t" -" mov x11, x0 \n\t" // B's address. -// " ldr x10, %[ai] \n\t" // A's address. -" mov x10, x20 \n\t" -// " ldr x12, %[k_mker] \n\t" -" mov x12, x21 \n\t" -// " ldr x13, %[k_left] \n\t" -" mov x13, x22 \n\t" -#ifdef _A64FX -" mov x16, 0x3 \n\t" // Tag A address. -" lsl x16, x16, #56 \n\t" -" orr x10, x10, x16 \n\t" -" mov x16, 0xa \n\t" // Control#2 for A address. -" lsl x16, x16, #60 \n\t" -" orr x10, x10, x16 \n\t" -#endif -" \n\t" -" cmp x12, #0 \n\t" // Don't preload if no microkernel there. -" b.eq END_CCOL_PRFM \n\t" -" \n\t" -" mov x14, x11 \n\t" -" ld1rd z20.d, p0/z, [x14] \n\t" // Load 8/10 of first B row. -" add x14, x14, x2 \n\t" -" ld1rd z21.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z22.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z23.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z24.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z25.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z26.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z27.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" prfm PLDL1KEEP, [x14] \n\t" // And prefetch the 2/10 left. -" add x14, x14, x2 \n\t" -" prfm PLDL1KEEP, [x14] \n\t" -" sub x14, x14, x2 \n\t" // Restore x14 to load edge. -" \n\t" -GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p1,p2,x10) -" add x16, x10, x4 \n\t" -" prfm PLDL1STRM, [x16] \n\t" // Prefetch 3/4 of A. 
-" add x16, x10, x4 \n\t" -" prfm PLDL1STRM, [x16] \n\t" -" add x16, x10, x4 \n\t" -" prfm PLDL1STRM, [x16] \n\t" -" \n\t" -" CCOL_PRFM: \n\t" -" cmp x6, #1 \n\t" -" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. -" mov x16, x5 \n\t" -" prfm PLDL1STRM, [x16] \n\t" -" add x16, x16, x7 \n\t" -" prfm PLDL1STRM, [x16] \n\t" -" add x16, x16, x7 \n\t" -" prfm PLDL1STRM, [x16] \n\t" -" add x16, x16, x7 \n\t" -" prfm PLDL1STRM, [x16] \n\t" -" add x16, x16, x7 \n\t" -" prfm PLDL1STRM, [x16] \n\t" -" add x16, x16, x7 \n\t" -" prfm PLDL1STRM, [x16] \n\t" -" add x16, x16, x7 \n\t" -" prfm PLDL1STRM, [x16] \n\t" -" add x16, x16, x7 \n\t" -" prfm PLDL1STRM, [x16] \n\t" -" add x16, x16, x7 \n\t" -" prfm PLDL1STRM, [x16] \n\t" -" add x16, x16, x7 \n\t" -" prfm PLDL1STRM, [x16] \n\t" -" END_CCOL_PRFM: \n\t" -" \n\t" -CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) -" \n\t" -" cmp x12, #0 \n\t" // If no 4-microkernel can be applied -" b.eq K_LEFT_LOOP \n\t" -" \n\t" -" K_MKER_LOOP: \n\t" -" \n\t" -GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z30,z31,p1,p2,x10,x15,x4,x16,noprfm) -GEMM_2VX10_MKER_LOOP_PLAIN_G_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2) -" \n\t" -GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z28,z29,p1,p2,x10,x15,x4,x16,noprfm) -GEMM_2VX10_MKER_LOOP_PLAIN_G_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2) -" \n\t" -GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z30,z31,p1,p2,x10,x15,x4,x16,noprfm) -GEMM_2VX10_MKER_LOOP_PLAIN_G_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2) -" \n\t" -" subs x12, x12, #1 \n\t" // Decrease counter before final replica. -" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. -" \n\t" -GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z28,z29,p1,p2,x10,x15,x4,x16,noprfm) -GEMM_2VX10_MKER_LOOP_PLAIN_G_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2) -" b K_MKER_LOOP \n\t" -" \n\t" -" FIN_MKER_LOOP: \n\t" -GEMM_2VX10_MKER_LOOP_PLAIN_G_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2) -" add x10, x10, x4 \n\t" // Forward A to fill the blank. -" \n\t" -" K_LEFT_LOOP: \n\t" -" cmp x13, #0 \n\t" // End of execution. -" b.eq WRITE_MEM_PREP \n\t" -" \n\t" -GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p1,p2,x10) -" mov x14, x11 \n\t" -" ld1rd z20.d, p0/z, [x14] \n\t" // Load 10/10 B. 
-" add x14, x14, x2 \n\t" -" ld1rd z21.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z22.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z23.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z24.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z25.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z26.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z27.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z28.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z29.d, p0/z, [x14] \n\t" -GEMM_FMLA2(z0,z1,p0,z30,z31,z20) -GEMM_FMLA2(z2,z3,p0,z30,z31,z21) -GEMM_FMLA2(z4,z5,p0,z30,z31,z22) -GEMM_FMLA2(z6,z7,p0,z30,z31,z23) -GEMM_FMLA2(z8,z9,p0,z30,z31,z24) -GEMM_FMLA2(z10,z11,p0,z30,z31,z25) -GEMM_FMLA2(z12,z13,p0,z30,z31,z26) -GEMM_FMLA2(z14,z15,p0,z30,z31,z27) -GEMM_FMLA2(z16,z17,p0,z30,z31,z28) -GEMM_FMLA2(z18,z19,p0,z30,z31,z29) -" add x10, x10, x4 \n\t" // Forward A. -" add x11, x11, x1 \n\t" // Forward B. -" sub x13, x13, #1 \n\t" -" b K_LEFT_LOOP \n\t" // Next column / row. -" \n\t" -" WRITE_MEM_PREP: \n\t" -" \n\t" -// " ldr x10, %[ai] \n\t" -" mov x10, x20 \n\t" -" add x11, x0, x3 \n\t" -" dup z30.d, x23 \n\t" // Broadcast alpha & beta into vectors. -" dup z31.d, x24 \n\t" -" \n\t" -" cmp x8, #1 \n\t" -" b.eq PREFETCH_ABNEXT \n\t" -" prfm PLDL1STRM, [x10] \n\t" -" prfm PLDL1KEEP, [x11] \n\t" -" add x11, x11, x2 \n\t" -" prfm PLDL1KEEP, [x11] \n\t" -" add x11, x11, x2 \n\t" -" prfm PLDL1KEEP, [x11] \n\t" -" add x11, x11, x2 \n\t" -" prfm PLDL1KEEP, [x11] \n\t" -" add x11, x11, x2 \n\t" -" prfm PLDL1KEEP, [x11] \n\t" -" add x11, x11, x2 \n\t" -" prfm PLDL1KEEP, [x11] \n\t" -" add x11, x11, x2 \n\t" -" prfm PLDL1KEEP, [x11] \n\t" -" add x11, x11, x2 \n\t" -" prfm PLDL1KEEP, [x11] \n\t" -" add x11, x11, x2 \n\t" -" prfm PLDL1KEEP, [x11] \n\t" -" add x11, x11, x2 \n\t" -" prfm PLDL1KEEP, [x11] \n\t" -" b WRITE_MEM \n\t" -" \n\t" -" PREFETCH_ABNEXT: \n\t" -// " ldr x1, %[a_next] \n\t" // Final Millikernel loop, x1 and x2 not needed. -" mov x1, x25 \n\t" -// " ldr x2, %[b_next] \n\t" -" mov x2, x26 \n\t" -" prfm PLDL2KEEP, [x1] \n\t" -" prfm PLDL2KEEP, [x1, 256*1] \n\t" -" prfm PLDL2KEEP, [x1, 256*2] \n\t" -" prfm PLDL2KEEP, [x1, 256*3] \n\t" -" prfm PLDL2KEEP, [x1, 256*4] \n\t" -" prfm PLDL2KEEP, [x1, 256*5] \n\t" -" prfm PLDL2KEEP, [x1, 256*6] \n\t" -" prfm PLDL2KEEP, [x1, 256*7] \n\t" -" prfm PLDL2KEEP, [x1, 256*8] \n\t" -" prfm PLDL2KEEP, [x1, 256*9] \n\t" -" prfm PLDL2KEEP, [x1, 256*10] \n\t" -" prfm PLDL2KEEP, [x1, 256*11] \n\t" -" prfm PLDL2KEEP, [x1, 256*12] \n\t" -" prfm PLDL2KEEP, [x1, 256*13] \n\t" -" prfm PLDL2KEEP, [x1, 256*14] \n\t" -" prfm PLDL2KEEP, [x1, 256*15] \n\t" -" prfm PLDL2KEEP, [x2] \n\t" -" prfm PLDL2KEEP, [x2, 256*1] \n\t" -" prfm PLDL2KEEP, [x2, 256*2] \n\t" -" prfm PLDL2KEEP, [x2, 256*3] \n\t" -" prfm PLDL2KEEP, [x2, 256*4] \n\t" -" prfm PLDL2KEEP, [x2, 256*5] \n\t" -" prfm PLDL2KEEP, [x2, 256*6] \n\t" -" prfm PLDL2KEEP, [x2, 256*7] \n\t" -" prfm PLDL2KEEP, [x2, 256*8] \n\t" -" prfm PLDL2KEEP, [x2, 256*9] \n\t" -" \n\t" -" WRITE_MEM: \n\t" -" \n\t" -" fmov d28, #1.0 \n\t" -" fmov x16, d28 \n\t" -" cmp x16, x23 \n\t" -" b.eq UNIT_ALPHA \n\t" -" \n\t" -SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30) -" \n\t" -" UNIT_ALPHA: \n\t" -" mov x9, x5 \n\t" // C address for loading. -" \n\t" // C address for storing is x5 itself. -" cmp x6, #1 \n\t" -" b.ne WRITE_MEM_G \n\t" -" \n\t" -" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30]. -" \n\t" // Here used scratch: Z[20-29]. 
-" mov x13, xzr \n\t" // C-column's physical 1-vector skip. -" incb x13 \n\t" -GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x9,x7) -GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31) -GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x9,x7) -" \n\t" -GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x5,x7) -GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) -GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x5,x7) -" b END_WRITE_MEM \n\t" -" \n\t" -" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30]. -" \n\t" // Here used scratch: Z[20-30] - Z30 as index. -" mov x12, xzr \n\t" -" incb x12 \n\t" -" madd x13, x12, x6, xzr \n\t" // C-column's logical 1-vector skip. -" index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8. -GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x9,x7,x13,x16) -GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31) -GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x9,x7,x13,x16) -" \n\t" -GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x5,x7,x13,x16) -GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) -GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x5,x7,x13,x16) -" \n\t" -" END_WRITE_MEM: \n\t" -" subs x8, x8, #1 \n\t" -" b.eq END_EXEC \n\t" -" \n\t" // Address of C already forwarded to next column. -" add x0, x0, x3 \n\t" // Forward B's base address to the next logic panel. -" b MILLIKER_MLOOP \n\t" -" \n\t" -" END_ERROR: \n\t" -" mov x0, #1 \n\t" // Return error. -" END_EXEC: \n\t" -" mov x0, #0 \n\t" // Return normal. 
-: -: [bi] "m" (bi), - [rs_b] "m" (rs_b), - [cs_b] "m" (cs_b), - [ps_b] "m" (ps_b), - [cs_a] "m" (cs_a), - [ci] "m" (ci), - [rs_c] "m" (rs_c), - [cs_c] "m" (cs_c), - [m_curr] "m" (m_curr), - [n_mker] "m" (n_mker), - [ai] "m" (ai), - [k_mker] "m" (k_mker), - [k_left] "m" (k_left), - [alpha] "m" (alpha), - [beta] "m" (beta), - [a_next] "m" (a_next), - [b_next] "m" (b_next) -: "x0","x1","x2","x3","x4","x5","x6","x7","x8", - "x9","x10","x11","x12","x13","x14","x15","x16","x17", - "x20","x21","x22","x23","x24","x25","x26", - "z0","z1","z2","z3","z4","z5","z6","z7", - "z8","z9","z10","z11","z12","z13","z14","z15", - "z16","z17","z18","z19", - "z20","z21","z22","z23", - "z24","z25","z26","z27", - "z28","z29","z30","z31" - ); - } -} - -void bli_dgemmsup_rv_armsve_10x2v_unindexed - ( - conj_t conjat, - conj_t conjbt, - dim_t m0t, - dim_t n0t, - dim_t k0, - double* restrict alpha, - double* restrict at, inc_t rs_at0, inc_t cs_at0, - double* restrict bt, inc_t rs_bt0, inc_t cs_bt0, - double* restrict beta, - double* restrict ct, inc_t rs_ct0, inc_t cs_ct0, - auxinfo_t* restrict datat, - cntx_t* restrict cntx - ) -{ - auxinfo_t data; - bli_auxinfo_set_next_a( bli_auxinfo_next_b( datat ), &data ); - bli_auxinfo_set_next_b( bli_auxinfo_next_a( datat ), &data ); - bli_auxinfo_set_ps_a( bli_auxinfo_ps_b( datat ), &data ); - bli_auxinfo_set_ps_b( bli_auxinfo_ps_a( datat ), &data ); - bli_dgemmsup_cv_armsve_2vx10_unindexed - ( - conjbt, conjat, - n0t, m0t, k0, - alpha, - bt, cs_bt0, rs_bt0, - at, cs_at0, rs_at0, - beta, - ct, cs_ct0, rs_ct0, - &data, - cntx - ); -} - diff --git a/kernels/armsve/3/sup/bli_gemmsup_rv_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/sup/bli_gemmsup_rv_armsve_asm_d2vx10_unindexed.c deleted file mode 100644 index 6bcea73f5d..0000000000 --- a/kernels/armsve/3/sup/bli_gemmsup_rv_armsve_asm_d2vx10_unindexed.c +++ /dev/null @@ -1,412 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, The University of Tokyo - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -*/ -#include "blis.h" -#include - -// Double-precision composite instructions. -#include "../armsve_asm_macros_double.h" - -// 2vx10 microkernels. -#include "../armsve_asm_2vx10.h" - -// Prototype reference kernel. -GEMMSUP_KER_PROT( double, d, gemmsup_r_armsve_ref2 ) - -void __attribute__ ((optimize(0))) bli_dgemmsup_rv_armsve_2vx10_unindexed - ( - conj_t conja, - conj_t conjb, - dim_t m0, - dim_t n0, - dim_t k0, - double* restrict alpha, - double* restrict a, inc_t rs_a0, inc_t cs_a0, - double* restrict b, inc_t rs_b0, inc_t cs_b0, - double* restrict beta, - double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx - ) -{ - static int called = 0; - if ( !called ) - { - fprintf(stderr, "rv called.\n"); - called = 1; - } - // r*r requires B to be stored in rows. - assert(cs_b0 == 1); - - dim_t n0_mker = n0 / 10; - dim_t n0_left = n0 % 10; - - if ( n0_left ) - { - // A[:, ::] - // B[::, n0_mker*10:n0] - // C[: , n0_mker*10:n0] - double *ai = a; - double *bi = b + n0_mker * 10 * cs_b0; - double *ci = c + n0_mker * 10 * cs_c0; - bli_dgemmsup_r_armsve_ref2 - ( - conja, conjb, - m0, n0_left, k0, - alpha, - ai, rs_a0, cs_a0, - bi, rs_b0, cs_b0, - beta, - ci, rs_c0, cs_c0, - data, - cntx - ); - } - // Return if it's a pure edge case. - if ( !n0_mker ) - return; - - // Determine VL. - uint64_t vlen2; - __asm__ ( - " mov x0, xzr \n\t" - " incd x0, ALL, MUL #2 \n\t" - " mov %[vlen2], x0 \n\t" - : [vlen2] "=r" (vlen2) - : - : "x0" - ); - - uint64_t rs_c = rs_c0; - uint64_t cs_c = cs_c0; - uint64_t rs_a = rs_a0; - uint64_t cs_a = cs_a0; - uint64_t rs_b = rs_b0; - // uint64_t cs_b = 1; - - uint64_t k_mker = k0 / 4; - uint64_t k_left = k0 % 4; - uint64_t m_mker = m0 / vlen2; - uint64_t m_left = m0 % vlen2; - if ( m_left ) - { - // Edge case on A side can be handled with one more (predicated) loop. - m_mker++; - } else - m_left = vlen2; - uint64_t ps_a = bli_auxinfo_ps_a( data ); - // uint64_t ps_b = bli_auxinfo_ps_b( data ); - - for ( dim_t in0_mker = 0; in0_mker < n0_mker; ++in0_mker ) - { - double *ai = a; - double *bi = b + in0_mker * 10 * cs_b0; - double *ci = c + in0_mker * 10 * cs_c0; - - void* a_next = bli_auxinfo_next_a( data ); - void* b_next = bli_auxinfo_next_b( data ); - - __asm__ volatile ( -" ldr x0, %[ai] \n\t" -" ldr x1, %[rs_a] \n\t" // Row-skip of A (element skip of A[:, l]). -" ldr x2, %[cs_a] \n\t" // Column-skip of A. -" ldr x3, %[ps_a] \n\t" // Panel-skip (vlen2*k) of A. -" ldr x4, %[rs_b] \n\t" // Row-Skip of B. -" \n\t" // Element skip of B[l, :] is guaranteed to be 1. -" ldr x5, %[ci] \n\t" -" ldr x6, %[rs_c] \n\t" // Row-skip of C. -" ldr x7, %[cs_c] \n\t" // Column-skip of C. -#ifdef _A64FX -" mov x16, 0x1 \n\t" // Tag C address. -" lsl x16, x16, #56 \n\t" -" orr x5, x5, x16 \n\t" -" mov x16, 0x2 \n\t" // Tag A address. -" lsl x16, x16, #56 \n\t" -" orr x0, x0, x16 \n\t" -#endif -" \n\t" -" mov x8, #8 \n\t" // Multiply some address skips by sizeof(double). 
-" madd x2, x8, x2, xzr \n\t" // cs_a -" madd x3, x8, x3, xzr \n\t" // ps_a -" madd x4, x8, x4, xzr \n\t" // rs_b -" madd x7, x8, x7, xzr \n\t" // cs_c -" mov x8, xzr \n\t" -" incb x8 \n\t" -" madd x14, x8, x1, xzr \n\t" // A-column's logical 1-vector skip. -" mov x8, #4 \n\t" -" madd x15, x8, x2, xzr \n\t" // Logical K=4 microkernel skip for A. -// " mov x8, #4 \n\t" -// " madd x17, x8, x4, xzr \n\t" // Logical K=4 microkernel skip for B. -" \n\t" -" ldr x8, %[m_mker] \n\t" // Number of M-loops. -" ptrue p0.d \n\t" -" ptrue p1.d \n\t" -" ptrue p2.d \n\t" -" \n\t" -" MILLIKER_MLOOP: \n\t" -" \n\t" -" cmp x8, #1 \n\t" -" b.ne UKER_BEGIN \n\t" -" \n\t" -" ldr x10, %[m_left] \n\t" // Final (incomplete) millikernel loop. -" mov x11, xzr \n\t" -" incd x11 \n\t" -" whilelo p1.d, xzr, x10 \n\t" // Overwrite p1/p2. -" whilelo p2.d, x11, x10 \n\t" -" \n\t" -" UKER_BEGIN: \n\t" -" mov x10, x0 \n\t" // A's address. -" ldr x11, %[bi] \n\t" // B's address. -" ldr x12, %[k_mker] \n\t" -" ldr x13, %[k_left] \n\t" -#ifdef _A64FX -" mov x16, 0x3 \n\t" // Tag B address. -" lsl x16, x16, #56 \n\t" -" orr x11, x11, x16 \n\t" -#endif -" \n\t" -" mov x16, x11 \n\t" // Prefetch first kernel of B. -" prfm PLDL1KEEP, [x16] \n\t" -" add x16, x16, x4 \n\t" -" prfm PLDL1KEEP, [x16] \n\t" -" add x16, x16, x4 \n\t" -" prfm PLDL1KEEP, [x16] \n\t" -" add x16, x16, x4 \n\t" -" prfm PLDL1KEEP, [x16] \n\t" -" \n\t" -" ld1rd z20.d, p0/z, [x11] \n\t" // (Partial) first B row. -" ld1rd z21.d, p0/z, [x11, #8] \n\t" -" ld1rd z22.d, p0/z, [x11, #16] \n\t" -" ld1rd z23.d, p0/z, [x11, #24] \n\t" -" ld1rd z24.d, p0/z, [x11, #32] \n\t" -" ld1rd z25.d, p0/z, [x11, #40] \n\t" -" ld1rd z26.d, p0/z, [x11, #48] \n\t" -" ld1rd z27.d, p0/z, [x11, #56] \n\t" -" \n\t" -" index z29.d, xzr, x1 \n\t" // First A column. -" \n\t" // Skips passed to index is not multiplied by 8. -GEMM_ACOL_GATHER_LOAD(z28,z29,z29,p1,p2,x10,x14,x16) -" \n\t" -CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) -" \n\t" -" cmp x12, #0 \n\t" // If no 4-microkernel can be applied -" b.eq K_LEFT_LOOP \n\t" -" \n\t" -" K_MKER_LOOP: \n\t" // Unroll the 4-loop. -" \n\t" -" index z31.d, xzr, x1 \n\t" -GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z30,z31,z31,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm) -GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4) -" \n\t" -" index z29.d, xzr, x1 \n\t" -GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z28,z29,z29,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm) -GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4) -" \n\t" -" index z31.d, xzr, x1 \n\t" -GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z30,z31,z31,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm) -GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4) -" \n\t" -" subs x12, x12, #1 \n\t" // Decrease counter before final replica. -" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. 
-" \n\t" -" index z29.d, xzr, x1 \n\t" -GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z28,z29,z29,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm) -GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4) -" b K_MKER_LOOP \n\t" -" \n\t" -" FIN_MKER_LOOP: \n\t" -GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4) -" add x10, x10, x2 \n\t" // Forward A to fill the blank. -" \n\t" -" K_LEFT_LOOP: \n\t" -" cmp x13, #0 \n\t" -" b.eq WRITE_MEM_PREP \n\t" -" \n\t" -" index z31.d, xzr, x1 \n\t" -GEMM_ACOL_GATHER_LOAD(z30,z31,z31,p1,p2,x10,x14,x16) -" ld1rd z20.d, p0/z, [x11] \n\t" -" ld1rd z21.d, p0/z, [x11, #8] \n\t" -" ld1rd z22.d, p0/z, [x11, #16] \n\t" -" ld1rd z23.d, p0/z, [x11, #24] \n\t" -" ld1rd z24.d, p0/z, [x11, #32] \n\t" -" ld1rd z25.d, p0/z, [x11, #40] \n\t" -" ld1rd z26.d, p0/z, [x11, #48] \n\t" -" ld1rd z27.d, p0/z, [x11, #56] \n\t" -" ld1rd z28.d, p0/z, [x11, #64] \n\t" -" ld1rd z29.d, p0/z, [x11, #72] \n\t" -GEMM_FMLA2(z0,z1,p0,z30,z31,z20) -GEMM_FMLA2(z2,z3,p0,z30,z31,z21) -GEMM_FMLA2(z4,z5,p0,z30,z31,z22) -GEMM_FMLA2(z6,z7,p0,z30,z31,z23) -GEMM_FMLA2(z8,z9,p0,z30,z31,z24) -GEMM_FMLA2(z10,z11,p0,z30,z31,z25) -GEMM_FMLA2(z12,z13,p0,z30,z31,z26) -GEMM_FMLA2(z14,z15,p0,z30,z31,z27) -GEMM_FMLA2(z16,z17,p0,z30,z31,z28) -GEMM_FMLA2(z18,z19,p0,z30,z31,z29) -" add x10, x10, x2 \n\t" // Forward A. -" add x11, x11, x4 \n\t" // Forward B. -" sub x13, x13, #1 \n\t" -" b K_LEFT_LOOP \n\t" // Next column / row. -" \n\t" -" WRITE_MEM_PREP: \n\t" -" \n\t" -" ldr x11, %[bi] \n\t" -" ldr x12, %[alpha] \n\t" // Load alpha & beta. -" ldr x13, %[beta] \n\t" -" ld1rd z30.d, p0/z, [x12] \n\t" -" ld1rd z31.d, p0/z, [x13] \n\t" -" ldr x12, [x12] \n\t" -" \n\t" -" cmp x8, #1 \n\t" -" b.eq PREFETCH_ABNEXT \n\t" -" prfm PLDL2STRM, [x11] \n\t" -" b WRITE_MEM \n\t" -" \n\t" -" PREFETCH_ABNEXT: \n\t" -" ldr x1, %[a_next] \n\t" // Final Millikernel loop, x1 and x2 not needed. -" ldr x2, %[b_next] \n\t" -" prfm PLDL2KEEP, [x1] \n\t" -" prfm PLDL2KEEP, [x1, 256*1] \n\t" -" prfm PLDL2KEEP, [x1, 256*2] \n\t" -" prfm PLDL2KEEP, [x1, 256*3] \n\t" -" prfm PLDL2KEEP, [x1, 256*4] \n\t" -" prfm PLDL2KEEP, [x1, 256*5] \n\t" -" prfm PLDL2KEEP, [x1, 256*6] \n\t" -" prfm PLDL2KEEP, [x1, 256*7] \n\t" -" prfm PLDL2KEEP, [x1, 256*8] \n\t" -" prfm PLDL2KEEP, [x1, 256*9] \n\t" -" prfm PLDL2KEEP, [x1, 256*10] \n\t" -" prfm PLDL2KEEP, [x1, 256*11] \n\t" -" prfm PLDL2KEEP, [x1, 256*12] \n\t" -" prfm PLDL2KEEP, [x1, 256*13] \n\t" -" prfm PLDL2KEEP, [x1, 256*14] \n\t" -" prfm PLDL2KEEP, [x1, 256*15] \n\t" -" prfm PLDL2KEEP, [x2] \n\t" -" prfm PLDL2KEEP, [x2, 256*1] \n\t" -" prfm PLDL2KEEP, [x2, 256*2] \n\t" -" prfm PLDL2KEEP, [x2, 256*3] \n\t" -" prfm PLDL2KEEP, [x2, 256*4] \n\t" -" prfm PLDL2KEEP, [x2, 256*5] \n\t" -" prfm PLDL2KEEP, [x2, 256*6] \n\t" -" prfm PLDL2KEEP, [x2, 256*7] \n\t" -" prfm PLDL2KEEP, [x2, 256*8] \n\t" -" prfm PLDL2KEEP, [x2, 256*9] \n\t" -" \n\t" -" WRITE_MEM: \n\t" -" \n\t" -" fmov d28, #1.0 \n\t" -" fmov x16, d28 \n\t" -" cmp x16, x12 \n\t" -" b.eq UNIT_ALPHA \n\t" -" \n\t" -SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30) -" \n\t" -" UNIT_ALPHA: \n\t" -" mov x9, x5 \n\t" // C address for loading. -" mov x10, x5 \n\t" // C address for storing. -" cmp x6, #1 \n\t" -" b.ne WRITE_MEM_G \n\t" -" \n\t" -" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30]. 
-" \n\t" // Here used scratch: Z[20-29]. -" mov x13, xzr \n\t" // C-column's physical 1-vector skip. -" incb x13 \n\t" -GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x9,x7) -GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31) -GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x9,x7) -" \n\t" -GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x10,x7) -GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) -GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x10,x7) -" b END_WRITE_MEM \n\t" -" \n\t" -" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30]. -" \n\t" // Here used scratch: Z[20-30] - Z30 as index. -" mov x12, xzr \n\t" -" incb x12 \n\t" -" madd x13, x12, x6, xzr \n\t" // C-column's logical 1-vector skip. -" index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8. -GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x9,x7,x13,x16) -GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31) -GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x9,x7,x13,x16) -" \n\t" -GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x10,x7,x13,x16) -GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) -GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x10,x7,x13,x16) -" \n\t" -" END_WRITE_MEM: \n\t" -" subs x8, x8, #1 \n\t" -" b.eq END_EXEC \n\t" -" \n\t" -" add x0, x0, x3 \n\t" // Forward A's base address to the next logic panel. -" add x5, x5, x13 \n\t" // Forward C's base address to the next logic panel. -" add x5, x5, x13 \n\t" -" b MILLIKER_MLOOP \n\t" -" \n\t" -" END_ERROR: \n\t" -" mov x0, #1 \n\t" // Return error. -" END_EXEC: \n\t" -" mov x0, #0 \n\t" // Return normal. -: -: [ai] "m" (ai), - [rs_a] "m" (rs_a), - [cs_a] "m" (cs_a), - [ps_a] "m" (ps_a), - [rs_b] "m" (rs_b), - [ci] "m" (ci), - [rs_c] "m" (rs_c), - [cs_c] "m" (cs_c), - [m_mker] "m" (m_mker), - [m_left] "m" (m_left), - [bi] "m" (bi), - [k_mker] "m" (k_mker), - [k_left] "m" (k_left), - [alpha] "m" (alpha), - [beta] "m" (beta), - [a_next] "m" (a_next), - [b_next] "m" (b_next) -: "x0","x1","x2","x3","x4","x5","x6","x7","x8", - "x9","x10","x11","x12","x13","x14","x15","x16",//"x17", - "z0","z1","z2","z3","z4","z5","z6","z7", - "z8","z9","z10","z11","z12","z13","z14","z15", - "z16","z17","z18","z19", - "z20","z21","z22","z23", - "z24","z25","z26","z27", - "z28","z29","z30","z31" - ); - } -} - diff --git a/kernels/armsve/bli_kernels_armsve.h b/kernels/armsve/bli_kernels_armsve.h index 3ccd79b68e..79ac710ab4 100644 --- a/kernels/armsve/bli_kernels_armsve.h +++ b/kernels/armsve/bli_kernels_armsve.h @@ -35,9 +35,10 @@ GEMM_UKR_PROT( double, d, gemm_armsve256_asm_8x8 ) GEMM_UKR_PROT( double, d, gemm_armsve_asm_2vx10_unindexed ) GEMM_UKR_PROT( float, s, gemm_armsve_asm_2vx10_unindexed ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_2vx10_unindexed ) -GEMMSUP_KER_PROT( double, d, gemmsup_cv_armsve_2vx10_unindexed ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_10x2v_unindexed ) +GEMM_UKR_PROT( scomplex, c, gemm_armsve_asm_2vx10_unindexed ) +GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx10_unindexed ) +GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx8_unindexed ) +GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx7_unindexed ) PACKM_KER_PROT( double, d, packm_armsve256_asm_8xk ) PACKM_KER_PROT( double, d, packm_armsve512_asm_16xk )