diff --git a/.github/workflows/auto-release.yml b/.github/workflows/auto-release.yml new file mode 100644 index 0000000000..0674fa4772 --- /dev/null +++ b/.github/workflows/auto-release.yml @@ -0,0 +1,115 @@ +name: Automatic release + +on: + push: + tags: + - 'v*' + - 'sv*' + +jobs: + create-rel: + name: BLIS release ${{ github.ref }} + runs-on: ubuntu-latest + steps: + - name: Create GH release + id: create_release + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ github.ref }} + release_name: Automatic release. + body: | + Automatically generated by GitHub actions. + draft: false + prerelease: true + outputs: + upload-url: ${{ steps.create_release.outputs.upload_url }} + + build-test-upload: + name: BLIS ${{ matrix.target.config }} - ${{ github.event_name }} + needs: create-rel + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + target: + - { + config: 'intel64', + cc: 'gcc', + cflags: '', + packages: 'gcc binutils' + } + - { + config: 'amd64', + cc: 'gcc', + cflags: '', + packages: 'gcc binutils' + } + # These Aarch64 configs would be merged after adding autodetection. + - { + config: 'cortexa57', + cc: 'aarch64-linux-gnu-gcc', + cflags: '-mno-outline-atomics', + packages: 'gcc-aarch64-linux-gnu libc6-dev-arm64-cross' + } + - { + config: 'armsve', + cc: 'aarch64-linux-gnu-gcc-10', + cflags: '-mno-outline-atomics', + packages: 'gcc-10-aarch64-linux-gnu libc6-dev-arm64-cross' + } + - { + config: 'a64fx', + cc: 'aarch64-linux-gnu-gcc-10', + cflags: '"-mno-outline-atomics -DCACHE_SECTOR_SIZE_READONLY"', + packages: 'gcc-10-aarch64-linux-gnu libc6-dev-arm64-cross' + } + + steps: + - uses: actions/checkout@v2 + + - name: Install build dependencies + run: | + sudo apt update + sudo apt install -y ${{ matrix.target.packages }} + ${{ matrix.target.cc }} --version + + - name: Configure project + run: | + mkdir dest + ./configure -t none -p ./dest \ + CC=${{ matrix.target.cc }} \ + CFLAGS=${{ matrix.target.cflags }} \ + ${{ matrix.target.config }} + + - name: Build + run: | + make -j + + - name: Quick check + if: startsWith(matrix.target.config, 'x86') + run: | + make -j checkblis-fast + + - name: Install + run: | + make install + + - name: Packup + run: | + cd dest + tar -zcvf ../libblis_${{ matrix.target.config }}_${{ matrix.target.cc }}.tar.gz \ + include lib + cd .. + + - name: Upload GH release asset + id: upload-release-asset + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ needs.create-rel.outputs.upload-url }} + asset_path: ./libblis_${{ matrix.target.config }}_${{ matrix.target.cc }}.tar.gz + asset_name: libblis_${{ matrix.target.config }}_${{ matrix.target.cc }}.tar.gz + asset_content_type: application/x-tar diff --git a/config/a64fx/bli_cntx_init_a64fx.c b/config/a64fx/bli_cntx_init_a64fx.c index 5061570f80..22ef4a944f 100644 --- a/config/a64fx/bli_cntx_init_a64fx.c +++ b/config/a64fx/bli_cntx_init_a64fx.c @@ -49,9 +49,11 @@ void bli_cntx_init_a64fx( cntx_t* cntx ) // their storage preferences. 
bli_cntx_set_l3_nat_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE, + 4, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, FALSE, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, FALSE, cntx ); @@ -67,11 +69,11 @@ void bli_cntx_init_a64fx( cntx_t* cntx ) // Initialize level-3 blocksize objects with architecture-specific values. // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 10, 10, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 256, 128, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 2048, 2048, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 23040, 26880, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, 16, 8 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 10, 10, 10, 10 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 256, 128, 192, 96 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 2048, 2048, 1536, 1536 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 23040, 26880, 11520, 11760 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. @@ -86,57 +88,6 @@ void bli_cntx_init_a64fx( cntx_t* cntx ) cntx ); -#if 0 - // Initialize sup thresholds with architecture-appropriate values. - // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 65, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 65, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 65, -1, -1 ); - - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); - - // Update the context with optimized small/unpacked gemm kernels. - bli_cntx_set_l3_sup_kers - ( - 4, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - cntx - ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 10, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 16, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); -#endif - // Set A64FX cache sector sizes for each PE/CMG // SC Fugaku might disable users' setting cache sizes. 
#if !defined(CACHE_SECTOR_SIZE_READONLY) diff --git a/config/armsve/bli_armsve_config_utils.c b/config/armsve/bli_armsve_config_utils.c index fdddeebabe..70501e39db 100644 --- a/config/armsve/bli_armsve_config_utils.c +++ b/config/armsve/bli_armsve_config_utils.c @@ -89,4 +89,6 @@ void PASTEMAC(ch, _blksz_armsve) (dim_t *m_r_, dim_t *n_r_, \ EXPANDMAC_BLKSZ_ARMSVE( s, 4 ) EXPANDMAC_BLKSZ_ARMSVE( d, 8 ) +EXPANDMAC_BLKSZ_ARMSVE( c, 8 ) +EXPANDMAC_BLKSZ_ARMSVE( z, 16 ) diff --git a/config/armsve/bli_armsve_config_utils.h b/config/armsve/bli_armsve_config_utils.h index 07aa9ba7d2..87bba73ed5 100644 --- a/config/armsve/bli_armsve_config_utils.h +++ b/config/armsve/bli_armsve_config_utils.h @@ -39,4 +39,6 @@ dim_t bli_vl_bits_armsve(void); void bli_s_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_); void bli_d_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_); +void bli_c_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_); +void bli_z_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_); diff --git a/config/armsve/bli_cntx_init_armsve.c b/config/armsve/bli_cntx_init_armsve.c index 434979f915..6d4d356d62 100644 --- a/config/armsve/bli_cntx_init_armsve.c +++ b/config/armsve/bli_cntx_init_armsve.c @@ -50,17 +50,23 @@ void bli_cntx_init_armsve( cntx_t* cntx ) // Block size. dim_t m_r_s, n_r_s, k_c_s, m_c_s, n_c_s; dim_t m_r_d, n_r_d, k_c_d, m_c_d, n_c_d; + dim_t m_r_c, n_r_c, k_c_c, m_c_c, n_c_c; + dim_t m_r_z, n_r_z, k_c_z, m_c_z, n_c_z; bli_s_blksz_armsve(&m_r_s, &n_r_s, &k_c_s, &m_c_s, &n_c_s); bli_d_blksz_armsve(&m_r_d, &n_r_d, &k_c_d, &m_c_d, &n_c_d); + bli_c_blksz_armsve(&m_r_c, &n_r_c, &k_c_c, &m_c_c, &n_c_c); + bli_z_blksz_armsve(&m_r_z, &n_r_z, &k_c_z, &m_c_z, &n_c_z); // Update the context with optimized native gemm micro-kernels and // their storage preferences. bli_cntx_set_l3_nat_ukrs ( - 2, + 4, // These are vector-length agnostic kernels. Yet knowing mr is required at runtime. - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, FALSE, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, FALSE, cntx ); @@ -84,11 +90,11 @@ void bli_cntx_init_armsve( cntx_t* cntx ) // Initialize level-3 blocksize objects with architecture-specific values. // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], m_r_s, m_r_d, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], n_r_s, n_r_d, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], m_c_s, m_c_d, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], k_c_s, k_c_d, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], n_c_s, n_c_d, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MR ], m_r_s, m_r_d, m_r_c, m_r_z ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], n_r_s, n_r_d, n_r_c, n_r_z ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], m_c_s, m_c_d, m_c_c, m_c_z ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], k_c_s, k_c_d, k_c_c, k_c_z ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], n_c_s, n_c_d, n_c_c, n_c_z ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
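For reference, the c/z register blocking added in the two hunks above follows the same vector-length-agnostic 2v×10 scheme already used for s/d: MR is two SVE vectors' worth of elements and NR is fixed at 10 by the `_2vx10_unindexed` kernels, which is why the hard-coded a64fx table (512-bit SVE) reads 32/16/16/8 for BLIS_MR across s/d/c/z. The snippet below is only an illustrative sketch of that relationship, not the body of `EXPANDMAC_BLKSZ_ARMSVE` (which also derives the cache blockings); `blksz_2vx10_sketch`, `vl_bytes`, and `elem_size` are hypothetical names introduced here.

```c
#include <stdio.h>

/* Illustrative sketch only: register blocking implied by the 2vx10 kernels.
   vl_bytes  - SVE vector length in bytes (64 on A64FX),
   elem_size - sizeof one element: 4 (s), 8 (d), 8 (c), 16 (z). */
static void blksz_2vx10_sketch( int vl_bytes, int elem_size,
                                int* m_r, int* n_r )
{
    *m_r = 2 * ( vl_bytes / elem_size ); /* two vectors per micro-panel column */
    *n_r = 10;                           /* fixed by the 2vx10 kernel family   */
}

int main( void )
{
    const char* dts[]   = { "s", "d", "c", "z" };
    const int   sizes[] = { 4, 8, 8, 16 };
    for ( int i = 0; i < 4; ++i )
    {
        int m_r, n_r;
        blksz_2vx10_sketch( 64, sizes[ i ], &m_r, &n_r );
        /* Prints m_r = 32/16/16/8 and n_r = 10, matching the BLIS_MR and
           BLIS_NR rows in the a64fx table above. */
        printf( "%s: m_r = %2d, n_r = %d\n", dts[ i ], m_r, n_r );
    }
    return 0;
}
```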
@@ -103,55 +109,5 @@ void bli_cntx_init_armsve( cntx_t* cntx ) cntx ); -#if 0 - // Initialize sup thresholds with architecture-appropriate values. - // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 101, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 101, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 101, -1, -1 ); - - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); - - // Update the context with optimized small/unpacked gemm kernels. - bli_cntx_set_l3_sup_kers - ( - 4, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - cntx - ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, n_r_d, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, m_r_d, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 2048, -1, -1 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); -#endif } diff --git a/docs/Performance.md b/docs/Performance.md index 051be7aea9..f4992d1dee 100644 --- a/docs/Performance.md +++ b/docs/Performance.md @@ -550,9 +550,9 @@ The `runthese.m` file will contain example invocations of the function. * Operating system: RHEL 8.3 * Page size: 256 bytes * Compiler: gcc 10.1.0 -* Results gathered: 2 April 2021; BLIS and SSL2 updated on 20 May 2021 +* Results gathered: 2 April 2021; BLIS and SSL2 updated on 21 Sept 2021 * Implementations tested: - * BLIS 61584de (post-0.8.1) + * BLIS b05279d (post-0.8.1) * configured with: * `../configure -t none CFLAGS="-DCACHE_SECTOR_SIZE_READONLY" a64fx` (single-threaded) * `../configure -t openmp CFLAGS="-DCACHE_SECTOR_SIZE_READONLY" a64fx` (multithreaded) @@ -574,7 +574,7 @@ The `runthese.m` file will contain example invocations of the function. * Multithreaded (12 core) execution requested via `export OMP_NUM_THREADS=12` * Multithreaded (48 core) execution requested via `export OMP_NUM_THREADS=48` * **NOTE**: While this version of ARMPL does provide multithreaded implementations of `symm`/`hemm`, `syrk`/`herk`, `trmm`, or `trsm` (with the exception `dtrsm`), but these implementations yield very low performance, and their long run times led us to skip collecting these data altogether. 
- * Fujitsu SSL2 (Fujitsu toolchain 1.2.31) + * Fujitsu SSL2 (Fujitsu toolchain 1.2.33) * Single-threaded (1 core) execution requested via `export OMP_NUM_THREADS=1 NPARALLEL=1` * Multithreaded (12 core) execution requested via `export OMP_NUM_THREADS=12 NPARALLEL=12` * Multithreaded (48 core) execution requested via `export OMP_NUM_THREADS=48 NPARALLEL=48` diff --git a/docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.pdf b/docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.pdf index e273d1d098..4d27944170 100644 Binary files a/docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.pdf and b/docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.pdf differ diff --git a/docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.png b/docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.png index 1316647d65..f51548effb 100644 Binary files a/docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.png and b/docs/graphs/large/l3_perf_a64fx_jc1ic1jr12_nt12.png differ diff --git a/docs/graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.pdf b/docs/graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.pdf index b311e0f5db..845dfaf862 100644 Binary files a/docs/graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.pdf and b/docs/graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.pdf differ diff --git a/docs/graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.png b/docs/graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.png index c2719da87a..08e46c6723 100644 Binary files a/docs/graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.png and b/docs/graphs/large/l3_perf_a64fx_jc1ic4jr12_nt48.png differ diff --git a/docs/graphs/large/l3_perf_a64fx_nt1.pdf b/docs/graphs/large/l3_perf_a64fx_nt1.pdf index 6f0b8c74fc..97a31560a1 100644 Binary files a/docs/graphs/large/l3_perf_a64fx_nt1.pdf and b/docs/graphs/large/l3_perf_a64fx_nt1.pdf differ diff --git a/docs/graphs/large/l3_perf_a64fx_nt1.png b/docs/graphs/large/l3_perf_a64fx_nt1.png index f2cb381786..0b7c2d72aa 100644 Binary files a/docs/graphs/large/l3_perf_a64fx_nt1.png and b/docs/graphs/large/l3_perf_a64fx_nt1.png differ diff --git a/kernels/armsve/3/armsve_asm_2vx10cmplx.h b/kernels/armsve/3/armsve_asm_2vx10cmplx.h new file mode 100644 index 0000000000..1b67d0d169 --- /dev/null +++ b/kernels/armsve/3/armsve_asm_2vx10cmplx.h @@ -0,0 +1,130 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ +#define GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C8Re,C9Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,C8Im,C9Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BAddr,BRSBit) \ + GEMM_FMLA2_LD1R(C0Re,C0Im,PT,AColRe,AColIm,BV0,BAddr,16) \ + GEMM_FMLA2_LD1R(C1Re,C1Im,PT,AColRe,AColIm,BV1,BAddr,18) \ + GEMM_FMLA2_LD1R(C2Re,C2Im,PT,AColRe,AColIm,BV2,BAddr,1) \ + GEMM_FMLA2_LD1R(C3Re,C3Im,PT,AColRe,AColIm,BV3,BAddr,3) \ + GEMM_FMLA2_LD1R(C4Re,C4Im,PT,AColRe,AColIm,BV4,BAddr,5) \ + GEMM_FMLA2_LD1R(C5Re,C5Im,PT,AColRe,AColIm,BV5,BAddr,7) \ + GEMM_FMLA2_LD1R(C6Re,C6Im,PT,AColRe,AColIm,BV6,BAddr,9) \ + GEMM_FMLA2_LD1R(C7Re,C7Im,PT,AColRe,AColIm,BV7,BAddr,11) \ + GEMM_FMLA2_LD1R(C8Re,C8Im,PT,AColRe,AColIm,BV0,BAddr,13) \ + GEMM_FMLA2_LD1R(C9Re,C9Im,PT,AColRe,AColIm,BV1,BAddr,15) \ + \ + GEMM_FMLX2_LD1R(C0Im,C0Re,PT,AColRe,AColIm,BV2,BAddr,17) \ + GEMM_FMLX2_LD1R(C1Im,C1Re,PT,AColRe,AColIm,BV3,BAddr,19) \ +" add "#BAddr", "#BRSBit", "#BAddr" \n\t" /* B address forward */ \ + GEMM_FMLX2_LD1R(C2Im,C2Re,PT,AColRe,AColIm,BV4,BAddr,0) \ + GEMM_FMLX2_LD1R(C3Im,C3Re,PT,AColRe,AColIm,BV5,BAddr,2) \ + GEMM_FMLX2_LD1R(C4Im,C4Re,PT,AColRe,AColIm,BV6,BAddr,4) \ + GEMM_FMLX2_LD1R(C5Im,C5Re,PT,AColRe,AColIm,BV7,BAddr,6) \ + GEMM_FMLX2_LD1R(C6Im,C6Re,PT,AColRe,AColIm,BV0,BAddr,8) \ + GEMM_FMLX2_LD1R(C7Im,C7Re,PT,AColRe,AColIm,BV1,BAddr,10) \ + GEMM_FMLX2_LD1R(C8Im,C8Re,PT,AColRe,AColIm,BV2,BAddr,12) \ + GEMM_FMLX2_LD1R(C9Im,C9Re,PT,AColRe,AColIm,BV3,BAddr,14) + +#define GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C8Re,C9Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,C8Im,C9Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BAddr,BRSBit) \ + GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C8Re,C9Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,C8Im,C9Im,PT,AColRe,AColIm,BV4,BV5,BV6,BV7,BV0,BV1,BV2,BV3,BAddr,BRSBit) + +#define GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C8Re,C9Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,C8Im,C9Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BAddr,BRSBit) \ + GEMM_FMLA2_LD1R(C0Re,C0Im,PT,AColRe,AColIm,BV0,BAddr,16) \ + GEMM_FMLA2_LD1R(C1Re,C1Im,PT,AColRe,AColIm,BV1,BAddr,18) \ + GEMM_FMLA2_LD1R(C2Re,C2Im,PT,AColRe,AColIm,BV2,BAddr,1) \ + GEMM_FMLA2_LD1R(C3Re,C3Im,PT,AColRe,AColIm,BV3,BAddr,3) \ + GEMM_FMLA2_LD1R(C4Re,C4Im,PT,AColRe,AColIm,BV4,BAddr,5) \ + GEMM_FMLA2_LD1R(C5Re,C5Im,PT,AColRe,AColIm,BV5,BAddr,7) \ + GEMM_FMLA2_LD1R(C6Re,C6Im,PT,AColRe,AColIm,BV6,BAddr,9) \ + GEMM_FMLA2_LD1R(C7Re,C7Im,PT,AColRe,AColIm,BV7,BAddr,11) \ + GEMM_FMLA2_LD1R(C8Re,C8Im,PT,AColRe,AColIm,BV0,BAddr,13) \ + GEMM_FMLA2_LD1R(C9Re,C9Im,PT,AColRe,AColIm,BV1,BAddr,15) \ + \ + GEMM_FMLX2_LD1R(C0Im,C0Re,PT,AColRe,AColIm,BV2,BAddr,17) \ + GEMM_FMLX2_LD1R(C1Im,C1Re,PT,AColRe,AColIm,BV3,BAddr,19) \ +" add "#BAddr", "#BRSBit", "#BAddr" \n\t" /* B address forward */ \ + GEMM_FMLX2(C2Im,C2Re,PT,AColRe,AColIm,BV4) \ + GEMM_FMLX2(C3Im,C3Re,PT,AColRe,AColIm,BV5) 
\ + GEMM_FMLX2(C4Im,C4Re,PT,AColRe,AColIm,BV6) \ + GEMM_FMLX2(C5Im,C5Re,PT,AColRe,AColIm,BV7) \ + GEMM_FMLX2(C6Im,C6Re,PT,AColRe,AColIm,BV0) \ + GEMM_FMLX2(C7Im,C7Re,PT,AColRe,AColIm,BV1) \ + GEMM_FMLX2(C8Im,C8Re,PT,AColRe,AColIm,BV2) \ + GEMM_FMLX2(C9Im,C9Re,PT,AColRe,AColIm,BV3) + +#define GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C8Re,C9Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,C8Im,C9Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BAddr,BRSBit) \ + GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C8Re,C9Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,C8Im,C9Im,PT,AColRe,AColIm,BV4,BV5,BV6,BV7,BV0,BV1,BV2,BV3,BAddr,BRSBit) + +#define CLEAR_COL20(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13,Z14,Z15,Z16,Z17,Z18,Z19) \ + CLEAR_COL4(Z00,Z01,Z02,Z03) \ + CLEAR_COL4(Z04,Z05,Z06,Z07) \ + CLEAR_COL4(Z08,Z09,Z10,Z11) \ + CLEAR_COL4(Z12,Z13,Z14,Z15) \ + CLEAR_COL4(Z16,Z17,Z18,Z19) + +// Moving is always .d. +// Never use .DT here! +#define MOV_COL2(ZD0Re,ZD0Im,ZD1Re,ZD1Im,Z0Re,Z0Im,Z1Re,Z1Im) \ +" mov "#ZD0Re".d, "#Z0Re".d \n\t" \ +" mov "#ZD0Im".d, "#Z0Im".d \n\t" \ +" mov "#ZD1Re".d, "#Z1Re".d \n\t" \ +" mov "#ZD1Im".d, "#Z1Im".d \n\t" + +#define GEMM_FMULCMPLX_COL2(ZD0Re,ZD0Im,ZD1Re,ZD1Im,PT,Z0Re,Z0Im,Z1Re,Z1Im,ZFactorRe,ZFactorIm) \ + FMUL_COL2(ZD0Re,ZD0Im,Z0Re,Z0Im,ZFactorRe) \ + FMUL_COL2(ZD1Re,ZD1Im,Z1Re,Z1Im,ZFactorRe) \ + GEMM_FMLX2(ZD0Im,ZD0Re,PT,Z0Re,Z0Im,ZFactorIm) \ + GEMM_FMLX2(ZD1Im,ZD1Re,PT,Z1Re,Z1Im,ZFactorIm) + +#define GEMM_FMLACMPLX_COL2(ZD0Re,ZD0Im,ZD1Re,ZD1Im,PT,Z0Re,Z0Im,Z1Re,Z1Im,ZFactorRe,ZFactorIm) \ + GEMM_FMLACMPLX(ZD0Re,ZD0Im,PT,Z0Re,Z0Im,ZFactorRe,ZFactorIm) \ + GEMM_FMLACMPLX(ZD1Re,ZD1Im,PT,Z1Re,Z1Im,ZFactorRe,ZFactorIm) + +#define GEMM_CCMPLX_LOAD_COL2_C(Z0Re,Z0Im,Z1Re,Z1Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z0Re,Z0Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z1Re,Z1Im,PT,CAddr,CCS) + +#define GEMM_CCMPLX_STORE_COL2_C(Z0Re,Z0Im,Z1Re,Z1Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z0Re,Z0Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z1Re,Z1Im,PT,CAddr,CCS) + +#define GEMM_CCMPLX_LOAD_COL2_G(Z0Re,Z0Im,Z1Re,Z1Im,PT,ZIndex,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z0Re,Z0Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z1Re,Z1Im,ZIndex,PT,PT,CAddr,CCS,CTemp) + +#define GEMM_CCMPLX_STORE_COL2_G(Z0Re,Z0Im,Z1Re,Z1Im,PT,ZIndex,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z0Re,Z0Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z1Re,Z1Im,ZIndex,PT,PT,CAddr,CCS,CTemp) + diff --git a/kernels/armsve/3/armsve_asm_2vx7cmplx.h b/kernels/armsve/3/armsve_asm_2vx7cmplx.h new file mode 100644 index 0000000000..43997deef4 --- /dev/null +++ b/kernels/armsve/3/armsve_asm_2vx7cmplx.h @@ -0,0 +1,135 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ +#define GEMM_2VX7CMPLX_MKER_LOOP_PLAIN_C(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,PT,AColRe,AColIm,B0Re,B1Re,B2Re,B3Re,B4Re,B5Re,B6Re,B0Im,B1Im,B2Im,B3Im,B4Im,B5Im,B6Im,BAddr,BRSBit) \ + GEMM_FMLA2_LD1R(C0Re,C0Im,PT,AColRe,AColIm,B0Re,BAddr,0) \ + GEMM_FMLA2_LD1R(C1Re,C1Im,PT,AColRe,AColIm,B1Re,BAddr,2) \ + GEMM_FMLA2_LD1R(C2Re,C2Im,PT,AColRe,AColIm,B2Re,BAddr,4) \ + GEMM_FMLA2_LD1R(C3Re,C3Im,PT,AColRe,AColIm,B3Re,BAddr,6) \ + GEMM_FMLA2_LD1R(C4Re,C4Im,PT,AColRe,AColIm,B4Re,BAddr,8) \ + GEMM_FMLA2_LD1R(C5Re,C5Im,PT,AColRe,AColIm,B5Re,BAddr,10) \ + GEMM_FMLA2_LD1R(C6Re,C6Im,PT,AColRe,AColIm,B6Re,BAddr,12) \ + GEMM_FMLX2_LD1R(C0Im,C0Re,PT,AColRe,AColIm,B0Im,BAddr,1) \ + GEMM_FMLX2_LD1R(C1Im,C1Re,PT,AColRe,AColIm,B1Im,BAddr,3) \ + GEMM_FMLX2_LD1R(C2Im,C2Re,PT,AColRe,AColIm,B2Im,BAddr,5) \ + GEMM_FMLX2_LD1R(C3Im,C3Re,PT,AColRe,AColIm,B3Im,BAddr,7) \ + GEMM_FMLX2_LD1R(C4Im,C4Re,PT,AColRe,AColIm,B4Im,BAddr,9) \ + GEMM_FMLX2_LD1R(C5Im,C5Re,PT,AColRe,AColIm,B5Im,BAddr,11) \ + GEMM_FMLX2_LD1R(C6Im,C6Re,PT,AColRe,AColIm,B6Im,BAddr,13) \ +" add "#BAddr", "#BRSBit", "#BAddr" \n\t" + +#define GEMM_2VX7CMPLX_MKER_LOOP_PLAIN_C_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,PT,AColRe,AColIm,B0Re,B1Re,B2Re,B3Re,B4Re,B5Re,B6Re,B0Im,B1Im,B2Im,B3Im,B4Im,B5Im,B6Im,BAddr,BRSBit) \ + GEMM_FMLA2(C0Re,C0Im,PT,AColRe,AColIm,B0Re) \ + GEMM_FMLA2(C1Re,C1Im,PT,AColRe,AColIm,B1Re) \ + GEMM_FMLA2(C2Re,C2Im,PT,AColRe,AColIm,B2Re) \ + GEMM_FMLA2(C3Re,C3Im,PT,AColRe,AColIm,B3Re) \ + GEMM_FMLA2(C4Re,C4Im,PT,AColRe,AColIm,B4Re) \ + GEMM_FMLA2(C5Re,C5Im,PT,AColRe,AColIm,B5Re) \ + GEMM_FMLA2(C6Re,C6Im,PT,AColRe,AColIm,B6Re) \ + GEMM_FMLX2(C0Im,C0Re,PT,AColRe,AColIm,B0Im) \ + GEMM_FMLX2(C1Im,C1Re,PT,AColRe,AColIm,B1Im) \ + GEMM_FMLX2(C2Im,C2Re,PT,AColRe,AColIm,B2Im) \ + GEMM_FMLX2(C3Im,C3Re,PT,AColRe,AColIm,B3Im) \ + GEMM_FMLX2(C4Im,C4Re,PT,AColRe,AColIm,B4Im) \ + GEMM_FMLX2(C5Im,C5Re,PT,AColRe,AColIm,B5Im) \ + GEMM_FMLX2(C6Im,C6Re,PT,AColRe,AColIm,B6Im) + +#define CLEAR_COL14(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13) \ + CLEAR_COL4(Z00,Z01,Z02,Z03) \ + CLEAR_COL4(Z04,Z05,Z06,Z07) \ + CLEAR_COL4(Z08,Z09,Z10,Z11) \ + CLEAR_COL2(Z12,Z13) + +#define GEMM_FMULCMPLX_COL7(ZD0Re,ZD0Im,ZD1Re,ZD1Im,ZD2Re,ZD2Im,ZD3Re,ZD3Im,ZD4Re,ZD4Im,ZD5Re,ZD5Im,ZD6Re,ZD6Im,PT,Z0Re,Z0Im,Z1Re,Z1Im,Z2Re,Z2Im,Z3Re,Z3Im,Z4Re,Z4Im,Z5Re,Z5Im,Z6Re,Z6Im,ZFactorRe,ZFactorIm) \ + FMUL_COL2(ZD0Re,ZD0Im,Z0Re,Z0Im,ZFactorRe) \ + FMUL_COL2(ZD1Re,ZD1Im,Z1Re,Z1Im,ZFactorRe) \ + FMUL_COL2(ZD2Re,ZD2Im,Z2Re,Z2Im,ZFactorRe) \ 
+ FMUL_COL2(ZD3Re,ZD3Im,Z3Re,Z3Im,ZFactorRe) \ + FMUL_COL2(ZD4Re,ZD4Im,Z4Re,Z4Im,ZFactorRe) \ + FMUL_COL2(ZD5Re,ZD5Im,Z5Re,Z5Im,ZFactorRe) \ + FMUL_COL2(ZD6Re,ZD6Im,Z6Re,Z6Im,ZFactorRe) \ + GEMM_FMLX2(ZD0Im,ZD0Re,PT,Z0Re,Z0Im,ZFactorIm) \ + GEMM_FMLX2(ZD1Im,ZD1Re,PT,Z1Re,Z1Im,ZFactorIm) \ + GEMM_FMLX2(ZD2Im,ZD2Re,PT,Z2Re,Z2Im,ZFactorIm) \ + GEMM_FMLX2(ZD3Im,ZD3Re,PT,Z3Re,Z3Im,ZFactorIm) \ + GEMM_FMLX2(ZD4Im,ZD4Re,PT,Z4Re,Z4Im,ZFactorIm) \ + GEMM_FMLX2(ZD5Im,ZD5Re,PT,Z5Re,Z5Im,ZFactorIm) \ + GEMM_FMLX2(ZD6Im,ZD6Re,PT,Z6Re,Z6Im,ZFactorIm) + +#define GEMM_FMLACMPLX_COL7(ZD0Re,ZD0Im,ZD1Re,ZD1Im,ZD2Re,ZD2Im,ZD3Re,ZD3Im,ZD4Re,ZD4Im,ZD5Re,ZD5Im,ZD6Re,ZD6Im,PT,Z0Re,Z0Im,Z1Re,Z1Im,Z2Re,Z2Im,Z3Re,Z3Im,Z4Re,Z4Im,Z5Re,Z5Im,Z6Re,Z6Im,ZFactorRe,ZFactorIm) \ + GEMM_FMLACMPLX(ZD0Re,ZD0Im,PT,Z0Re,Z0Im,ZFactorRe,ZFactorIm) \ + GEMM_FMLACMPLX(ZD1Re,ZD1Im,PT,Z1Re,Z1Im,ZFactorRe,ZFactorIm) \ + GEMM_FMLACMPLX(ZD2Re,ZD2Im,PT,Z2Re,Z2Im,ZFactorRe,ZFactorIm) \ + GEMM_FMLACMPLX(ZD3Re,ZD3Im,PT,Z3Re,Z3Im,ZFactorRe,ZFactorIm) \ + GEMM_FMLACMPLX(ZD4Re,ZD4Im,PT,Z4Re,Z4Im,ZFactorRe,ZFactorIm) \ + GEMM_FMLACMPLX(ZD5Re,ZD5Im,PT,Z5Re,Z5Im,ZFactorRe,ZFactorIm) \ + GEMM_FMLACMPLX(ZD6Re,ZD6Im,PT,Z6Re,Z6Im,ZFactorRe,ZFactorIm) + +#define GEMM_CCMPLX_LOAD_COL7_C(Z0Re,Z0Im,Z1Re,Z1Im,Z2Re,Z2Im,Z3Re,Z3Im,Z4Re,Z4Im,Z5Re,Z5Im,Z6Re,Z6Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z0Re,Z0Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z1Re,Z1Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z2Re,Z2Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z3Re,Z3Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z4Re,Z4Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z5Re,Z5Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z6Re,Z6Im,PT,CAddr,CCS) + +#define GEMM_CCMPLX_STORE_COL7_C(Z0Re,Z0Im,Z1Re,Z1Im,Z2Re,Z2Im,Z3Re,Z3Im,Z4Re,Z4Im,Z5Re,Z5Im,Z6Re,Z6Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z0Re,Z0Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z1Re,Z1Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z2Re,Z2Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z3Re,Z3Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z4Re,Z4Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z5Re,Z5Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z6Re,Z6Im,PT,CAddr,CCS) + +#define GEMM_CCMPLX_LOAD_COL7_G(Z0Re,Z0Im,Z1Re,Z1Im,Z2Re,Z2Im,Z3Re,Z3Im,Z4Re,Z4Im,Z5Re,Z5Im,Z6Re,Z6Im,PT,ZIndex,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z0Re,Z0Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z1Re,Z1Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z2Re,Z2Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z3Re,Z3Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z4Re,Z4Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z5Re,Z5Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z6Re,Z6Im,ZIndex,PT,PT,CAddr,CCS,CTemp) + +#define GEMM_CCMPLX_STORE_COL7_G(Z0Re,Z0Im,Z1Re,Z1Im,Z2Re,Z2Im,Z3Re,Z3Im,Z4Re,Z4Im,Z5Re,Z5Im,Z6Re,Z6Im,PT,ZIndex,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z0Re,Z0Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z1Re,Z1Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z2Re,Z2Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z3Re,Z3Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z4Re,Z4Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + 
GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z5Re,Z5Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z6Re,Z6Im,ZIndex,PT,PT,CAddr,CCS,CTemp) + diff --git a/kernels/armsve/3/armsve_asm_2vx8cmplx.h b/kernels/armsve/3/armsve_asm_2vx8cmplx.h new file mode 100644 index 0000000000..16711930a4 --- /dev/null +++ b/kernels/armsve/3/armsve_asm_2vx8cmplx.h @@ -0,0 +1,116 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +*/ +#define GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BV8,BV9,BV10,BV11,BAddr,BRSBit) \ + GEMM_FMLA2_LD1R(C0Re,C0Im,PT,AColRe,AColIm,BV0,BAddr,9) \ + GEMM_FMLA2_LD1R(C1Re,C1Im,PT,AColRe,AColIm,BV1,BAddr,11) \ + GEMM_FMLA2_LD1R(C2Re,C2Im,PT,AColRe,AColIm,BV2,BAddr,13) \ + GEMM_FMLA2_LD1R(C3Re,C3Im,PT,AColRe,AColIm,BV3,BAddr,15) \ +" add "#BAddr", "#BRSBit", "#BAddr" \n\t" /* B address forward */ \ + GEMM_FMLA2_LD1R(C4Re,C4Im,PT,AColRe,AColIm,BV4,BAddr,0) \ + GEMM_FMLA2_LD1R(C5Re,C5Im,PT,AColRe,AColIm,BV5,BAddr,2) \ + GEMM_FMLA2_LD1R(C6Re,C6Im,PT,AColRe,AColIm,BV6,BAddr,4) \ + GEMM_FMLA2_LD1R(C7Re,C7Im,PT,AColRe,AColIm,BV7,BAddr,6) \ + \ + GEMM_FMLX2_LD1R(C0Im,C0Re,PT,AColRe,AColIm,BV8,BAddr,8) \ + GEMM_FMLX2_LD1R(C1Im,C1Re,PT,AColRe,AColIm,BV9,BAddr,10) \ + GEMM_FMLX2_LD1R(C2Im,C2Re,PT,AColRe,AColIm,BV10,BAddr,12) \ + GEMM_FMLX2_LD1R(C3Im,C3Re,PT,AColRe,AColIm,BV11,BAddr,14) \ + GEMM_FMLX2_LD1R(C4Im,C4Re,PT,AColRe,AColIm,BV0,BAddr,1) \ + GEMM_FMLX2_LD1R(C5Im,C5Re,PT,AColRe,AColIm,BV1,BAddr,3) \ + GEMM_FMLX2_LD1R(C6Im,C6Re,PT,AColRe,AColIm,BV2,BAddr,5) \ + GEMM_FMLX2_LD1R(C7Im,C7Re,PT,AColRe,AColIm,BV3,BAddr,7) + +#define GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_2(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BV8,BV9,BV10,BV11,BAddr,BRSBit) \ + GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV4,BV5,BV6,BV7,BV8,BV9,BV10,BV11,BV0,BV1,BV2,BV3,BAddr,BRSBit) + +#define GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_3(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BV8,BV9,BV10,BV11,BAddr,BRSBit) \ + GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV8,BV9,BV10,BV11,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BAddr,BRSBit) + +#define GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BV8,BV9,BV10,BV11,BAddr,BRSBit) \ + GEMM_FMLA2_LD1R(C0Re,C0Im,PT,AColRe,AColIm,BV0,BAddr,9) \ + GEMM_FMLA2_LD1R(C1Re,C1Im,PT,AColRe,AColIm,BV1,BAddr,11) \ + GEMM_FMLA2_LD1R(C2Re,C2Im,PT,AColRe,AColIm,BV2,BAddr,13) \ + GEMM_FMLA2_LD1R(C3Re,C3Im,PT,AColRe,AColIm,BV3,BAddr,15) \ +" add "#BAddr", "#BRSBit", "#BAddr" \n\t" /* B address forward */ \ + GEMM_FMLA2(C4Re,C4Im,PT,AColRe,AColIm,BV4) \ + GEMM_FMLA2(C5Re,C5Im,PT,AColRe,AColIm,BV5) \ + GEMM_FMLA2(C6Re,C6Im,PT,AColRe,AColIm,BV6) \ + GEMM_FMLA2(C7Re,C7Im,PT,AColRe,AColIm,BV7) \ + \ + GEMM_FMLX2(C0Im,C0Re,PT,AColRe,AColIm,BV8) \ + GEMM_FMLX2(C1Im,C1Re,PT,AColRe,AColIm,BV9) \ + GEMM_FMLX2(C2Im,C2Re,PT,AColRe,AColIm,BV10) \ + GEMM_FMLX2(C3Im,C3Re,PT,AColRe,AColIm,BV11) \ + GEMM_FMLX2(C4Im,C4Re,PT,AColRe,AColIm,BV0) \ + GEMM_FMLX2(C5Im,C5Re,PT,AColRe,AColIm,BV1) \ + GEMM_FMLX2(C6Im,C6Re,PT,AColRe,AColIm,BV2) \ + GEMM_FMLX2(C7Im,C7Re,PT,AColRe,AColIm,BV3) + +#define GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_3_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BV8,BV9,BV10,BV11,BAddr,BRSBit) \ + 
GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV8,BV9,BV10,BV11,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BAddr,BRSBit) + +#define CLEAR_COL16(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13,Z14,Z15) \ + CLEAR_COL4(Z00,Z01,Z02,Z03) \ + CLEAR_COL4(Z04,Z05,Z06,Z07) \ + CLEAR_COL4(Z08,Z09,Z10,Z11) \ + CLEAR_COL4(Z12,Z13,Z14,Z15) + +#define GEMM_FMULCMPLX_COL2(ZD0Re,ZD0Im,ZD1Re,ZD1Im,PT,Z0Re,Z0Im,Z1Re,Z1Im,ZFactorRe,ZFactorIm) \ + FMUL_COL2(ZD0Re,ZD0Im,Z0Re,Z0Im,ZFactorRe) \ + FMUL_COL2(ZD1Re,ZD1Im,Z1Re,Z1Im,ZFactorRe) \ + GEMM_FMLX2(ZD0Im,ZD0Re,PT,Z0Re,Z0Im,ZFactorIm) \ + GEMM_FMLX2(ZD1Im,ZD1Re,PT,Z1Re,Z1Im,ZFactorIm) + +#define GEMM_FMLACMPLX_COL2(ZD0Re,ZD0Im,ZD1Re,ZD1Im,PT,Z0Re,Z0Im,Z1Re,Z1Im,ZFactorRe,ZFactorIm) \ + GEMM_FMLACMPLX(ZD0Re,ZD0Im,PT,Z0Re,Z0Im,ZFactorRe,ZFactorIm) \ + GEMM_FMLACMPLX(ZD1Re,ZD1Im,PT,Z1Re,Z1Im,ZFactorRe,ZFactorIm) + +#define GEMM_CCMPLX_LOAD_COL2_C(Z0Re,Z0Im,Z1Re,Z1Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z0Re,Z0Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z1Re,Z1Im,PT,CAddr,CCS) + +#define GEMM_CCMPLX_STORE_COL2_C(Z0Re,Z0Im,Z1Re,Z1Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z0Re,Z0Im,PT,CAddr,CCS) \ + GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z1Re,Z1Im,PT,CAddr,CCS) + +#define GEMM_CCMPLX_LOAD_COL2_G(Z0Re,Z0Im,Z1Re,Z1Im,PT,ZIndex,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z0Re,Z0Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z1Re,Z1Im,ZIndex,PT,PT,CAddr,CCS,CTemp) + +#define GEMM_CCMPLX_STORE_COL2_G(Z0Re,Z0Im,Z1Re,Z1Im,PT,ZIndex,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z0Re,Z0Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ + GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z1Re,Z1Im,ZIndex,PT,PT,CAddr,CCS,CTemp) + diff --git a/kernels/armsve/3/armsve_asm_macros_cmplx.h b/kernels/armsve/3/armsve_asm_macros_cmplx.h new file mode 100644 index 0000000000..10097700c8 --- /dev/null +++ b/kernels/armsve/3/armsve_asm_macros_cmplx.h @@ -0,0 +1,89 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ +#include "armsve_asm_macros.h" + +#define FMUL_COL2(ZD0,ZD1,Z0,Z1,ZFACTOR) \ +" fmul "#ZD0"."DT", "#Z0"."DT", "#ZFACTOR"."DT" \n\t" \ +" fmul "#ZD1"."DT", "#Z1"."DT", "#ZFACTOR"."DT" \n\t" \ + +#define GEMM_FMLX2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \ +" fmla "#CCOLFH"."DT", "#PT"/m, "#ACOLFH"."DT", "#BV"."DT" \n\t" \ +" fmls "#CCOLLH"."DT", "#PT"/m, "#ACOLLH"."DT", "#BV"."DT" \n\t" + +#define GEMM_FMLX2_LD1R(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV,BADDR,NSHIFT) \ + GEMM_FMLX2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \ +" "LD1R" "#BV"."DT", "#PT"/z, ["#BADDR", #"#NSHIFT"*"SZ"]\n\t" + +#define GEMM_FMULCMPLX(ZDRe,ZDIm,PT,Z0Re,Z0Im,Z1Re,Z1Im) \ + FMUL_COL2(ZDRe,ZDIm,Z0Re,Z0Im,Z1Re) \ + GEMM_FMLX2(ZDIm,ZDRe,PT,Z0Re,Z0Im,Z1Im) + +#define GEMM_FMLACMPLX(ZDRe,ZDIm,PT,Z0Re,Z0Im,Z1Re,Z1Im) \ + GEMM_FMLA2(ZDRe,ZDIm,PT,Z0Re,Z0Im,Z1Re) \ + GEMM_FMLX2(ZDIm,ZDRe,PT,Z0Re,Z0Im,Z1Im) + +#define GEMM_ACOLCMPLX_CONTIGUOUS_LOAD(ZRe,ZIm,PT,AAddr) \ +" "LD2" {"#ZRe"."DT", "#ZIm"."DT"}, "#PT"/z, ["#AAddr"] \n\t" + +#define GEMM_ACOLCMPLX_CONTIGUOUS_STORE(ZRe,ZIm,PT,AAddr) \ +" "ST2" {"#ZRe"."DT", "#ZIm"."DT"}, "#PT", ["#AAddr"] \n\t" + +#define GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(ZRe,ZIm,PT,AAddr,ACS) \ + GEMM_ACOLCMPLX_CONTIGUOUS_LOAD(ZRe,ZIm,PT,AAddr) \ +" add "#AAddr", "#AAddr", "#ACS" \n\t" /* Forward A address (load) to next column. */ + +#define GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(ZRe,ZIm,PT,CAddr,CCS) \ + GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(ZRe,ZIm,PT,CAddr,CCS) + +#define GEMM_ACOLCMPLX_CONTIGUOUS_STORE_FWD(ZRe,ZIm,PT,AAddr,ACS) \ + GEMM_ACOLCMPLX_CONTIGUOUS_STORE(ZRe,ZIm,PT,AAddr) \ +" add "#AAddr", "#AAddr", "#ACS" \n\t" /* Forward A address (load) to next column. */ + +#define GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(ZRe,ZIm,PT,CAddr,CCS) \ + GEMM_ACOLCMPLX_CONTIGUOUS_STORE_FWD(ZRe,ZIm,PT,CAddr,CCS) + +#define GEMM_CCOLCMPLX_GATHER_LOAD_FWD(ZRe,ZIm,ZIndex,PRe,PIm,CAddr,CCS,CTemp) \ +" add "#CTemp", "#CAddr", #"SZ" \n\t" /* Imaginary skip */ \ +" "LD1" "#ZRe"."DT", "#PRe"/z, ["#CAddr", "#ZIndex"."DT", "OFFS"]\n\t" \ +" "LD1" "#ZIm"."DT", "#PRe"/z, ["#CTemp", "#ZIndex"."DT", "OFFS"]\n\t" \ +" add "#CAddr", "#CAddr", "#CCS" \n\t" + +#define GEMM_CCOLCMPLX_SCATTER_STORE_FWD(ZRe,ZIm,ZIndex,PRe,PIm,CAddr,CCS,CTemp) \ +" add "#CTemp", "#CAddr", #"SZ" \n\t" /* Imaginary skip */ \ +" "ST1" "#ZRe"."DT", "#PRe", ["#CAddr", "#ZIndex"."DT", "OFFS"]\n\t" \ +" "ST1" "#ZIm"."DT", "#PRe", ["#CTemp", "#ZIndex"."DT", "OFFS"]\n\t" \ +" add "#CAddr", "#CAddr", "#CCS" \n\t" + diff --git a/kernels/armsve/3/armsve_asm_macros_dcomplex.h b/kernels/armsve/3/armsve_asm_macros_dcomplex.h new file mode 100644 index 0000000000..0beb5d2316 --- /dev/null +++ b/kernels/armsve/3/armsve_asm_macros_dcomplex.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ +// Specify to use double precision. +#define DT "d" +#define LD1 "ld1d" +#define ST1 "st1d" +#define LD2 "ld2d" +#define ST2 "st2d" +#define LD1R "ld1rd" +#define PRFG "prfd" +#define SZ "8" +#define OFFS "lsl #3" +// Include macros. +#include "armsve_asm_macros_cmplx.h" + diff --git a/kernels/armsve/3/armsve_asm_macros_scomplex.h b/kernels/armsve/3/armsve_asm_macros_scomplex.h new file mode 100644 index 0000000000..f49cfedfba --- /dev/null +++ b/kernels/armsve/3/armsve_asm_macros_scomplex.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ +// Specify to use single precision. +#define DT "s" +#define LD1 "ld1w" +#define ST1 "st1w" +#define LD2 "ld2w" +#define ST2 "st2w" +#define LD1R "ld1rw" +#define PRFG "prfw" +#define SZ "4" +#define OFFS "uxtw #2" +// Include macros. +#include "armsve_asm_macros_cmplx.h" + diff --git a/kernels/armsve/3/bli_gemm_armsve256_asm_d8x8.c b/kernels/armsve/3/bli_gemm_armsve256_asm_d8x8.c index 01bb644b12..7262ac0e39 100644 --- a/kernels/armsve/3/bli_gemm_armsve256_asm_d8x8.c +++ b/kernels/armsve/3/bli_gemm_armsve256_asm_d8x8.c @@ -60,6 +60,10 @@ void bli_dgemm_armsve256_asm_8x8 { void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); + static int called = 0; + if (!called) + fprintf(stderr, "8x8 called\n"); + called = 1; // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. @@ -70,27 +74,27 @@ void bli_dgemm_armsve256_asm_8x8 __asm__ volatile ( -" \n\t" -" ldr x0,%[aaddr] \n\t" // Load address of A -" ldr x1,%[baddr] \n\t" // Load address of B -" ldr x2,%[caddr] \n\t" // Load address of C -" \n\t" -" ldr x3,%[a_next] \n\t" // Move pointer -" ldr x4,%[b_next] \n\t" // Move pointer -" \n\t" -" ldr x5,%[k_iter] \n\t" // Init guard (k_iter) -" ldr x6,%[k_left] \n\t" // Init guard (k_iter) -" \n\t" -" ldr x7,%[alpha] \n\t" // Alpha address -" ldr x8,%[beta] \n\t" // Beta address -" \n\t" -" ldr x9,%[cs_c] \n\t" // Load cs_c -" lsl x10,x9,#3 \n\t" // cs_c * sizeof(double) -" \n\t" -" ldr x13,%[rs_c] \n\t" // Load rs_c. -" lsl x14,x13,#3 \n\t" // rs_c * sizeof(double). -" \n\t" -" add x20,x2,x10 \n\t" //Load address Column 1 of C +// " \n\t" +// " ldr x0,%[aaddr] \n\t" // Load address of A +// " ldr x1,%[baddr] \n\t" // Load address of B +// " ldr x2,%[caddr] \n\t" // Load address of C +// " \n\t" +// " ldr x3,%[a_next] \n\t" // Move pointer +// " ldr x4,%[b_next] \n\t" // Move pointer +// " \n\t" +// " ldr x5,%[k_iter] \n\t" // Init guard (k_iter) +// " ldr x6,%[k_left] \n\t" // Init guard (k_iter) +// " \n\t" +// " ldr x7,%[alpha] \n\t" // Alpha address +// " ldr x8,%[beta] \n\t" // Beta address +// " \n\t" +// " ldr x9,%[cs_c] \n\t" // Load cs_c +// " ldr x13,%[rs_c] \n\t" // Load rs_c. +" \n\t" +" lsl x10,%9,#3 \n\t" // cs_c * sizeof(double) +" lsl x14,%10,#3 \n\t" // rs_c * sizeof(double). +" \n\t" +" add x20,%2,x10 \n\t" //Load address Column 1 of C " add x21,x20,x10 \n\t" //Load address Column 2 of C " add x22,x21,x10 \n\t" //Load address Column 3 of C " add x23,x22,x10 \n\t" //Load address Column 4 of C @@ -98,7 +102,7 @@ __asm__ volatile " add x25,x24,x10 \n\t" //Load address Column 6 of C " add x26,x25,x10 \n\t" //Load address Column 7 of C " \n\t" -" prfm pldl1keep,[x2] \n\t" // Prefetch c. +" prfm pldl1keep,[%2] \n\t" // Prefetch c. " prfm pldl1keep,[x20] \n\t" // Prefetch c. " prfm pldl1keep,[x21] \n\t" // Prefetch c. " prfm pldl1keep,[x22] \n\t" // Prefetch c. @@ -107,39 +111,39 @@ __asm__ volatile " prfm pldl1keep,[x25] \n\t" // Prefetch c. 
" prfm pldl1keep,[x26] \n\t" // Prefetch c. " \n\t" -" ldr z0, [x0] \n\t" // Load a -" ldr z1, [x0, #1, MUL VL] \n\t" +" ldr z0, [%0] \n\t" // Load a +" ldr z1, [%0, #1, MUL VL] \n\t" " \n\t" " ptrue p0.d, all \n\t" -" ld1rqd {z2.d}, p0/z, [x1] \n\t" // load b( l,0:1 ) -" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) -" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) -" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" ld1rqd {z2.d}, p0/z, [%1] \n\t" // load b( l,0:1 ) +" ld1rqd {z3.d}, p0/z, [%1, #16] \n\t" // load b( l,2:3 ) +" ld1rqd {z4.d}, p0/z, [%1, #32] \n\t" // load b( l,4:5 ) +" ld1rqd {z5.d}, p0/z, [%1, #48] \n\t" // load b( l,6:7 ) " \n\t" -" \n\t" // PRFM, the following prefetch on [x1] and [x0] +" \n\t" // PRFM, the following prefetch on [%1] and [%0] " \n\t" // is for b rows 4..7 and a columns 4..7. " \n\t" // both of them will be used in next iteration " \n\t" // of k_iter (unrolled per 4 loops) " \n\t" " dup z16.d, #0 \n\t" // Vector for accummulating column 0 -" prfm PLDL1KEEP, [x1, #256] \n\t" // prefetch b row no.4 +" prfm PLDL1KEEP, [%1, #256] \n\t" // prefetch b row no.4 " dup z17.d, #0 \n\t" // Vector for accummulating column 0 -" prfm PLDL1KEEP, [x1, #320] \n\t" // prefetch b row no.5 +" prfm PLDL1KEEP, [%1, #320] \n\t" // prefetch b row no.5 " dup z18.d, #0 \n\t" // Vector for accummulating column 1 -" prfm PLDL1KEEP, [x1, #384] \n\t" // prefetch b row no.6 +" prfm PLDL1KEEP, [%1, #384] \n\t" // prefetch b row no.6 " dup z19.d, #0 \n\t" // Vector for accummulating column 1 -" prfm PLDL1KEEP, [x1, #448] \n\t" // preftech b row no.7 +" prfm PLDL1KEEP, [%1, #448] \n\t" // preftech b row no.7 " dup z20.d, #0 \n\t" // Vector for accummulating column 2 " dup z21.d, #0 \n\t" // Vector for accummulating column 2 " \n\t" " dup z22.d, #0 \n\t" // Vector for accummulating column 3 -" prfm PLDL1KEEP, [x0, #256] \n\t" // prefetch a col. no.4 +" prfm PLDL1KEEP, [%0, #256] \n\t" // prefetch a col. no.4 " dup z23.d, #0 \n\t" // Vector for accummulating column 3 -" prfm PLDL1KEEP, [x0, #320] \n\t" // prefetch a col. no.5 +" prfm PLDL1KEEP, [%0, #320] \n\t" // prefetch a col. no.5 " dup z24.d, #0 \n\t" // Vector for accummulating column 4 -" prfm PLDL1KEEP, [x0, #384] \n\t" // prefetch a col. no.6 +" prfm PLDL1KEEP, [%0, #384] \n\t" // prefetch a col. no.6 " dup z25.d, #0 \n\t" // Vector for accummulating column 4 -" prfm PLDL1KEEP, [x0, #448] \n\t" // prefetch a col. no.7 +" prfm PLDL1KEEP, [%0, #448] \n\t" // prefetch a col. no.7 " dup z26.d, #0 \n\t" // Vector for accummulating column 5 " dup z27.d, #0 \n\t" // Vector for accummulating column 5 " \n\t" @@ -149,157 +153,157 @@ __asm__ volatile " dup z31.d, #0 \n\t" // Vector for accummulating column 7 " \n\t" " \n\t" -" cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left. +" cmp %5,#0 \n\t" // If k_iter == 0, jump to k_left. " beq .DCONSIDERKLEFT \n\t" " \n\t" -" add x0, x0, #64 \n\t" //update address of A -" add x1, x1, #64 \n\t" //update address of B +" add %0, %0, #64 \n\t" //update address of A +" add %1, %1, #64 \n\t" //update address of B " \n\t" -" cmp x5,1 \n\t" // If there is just one k_iter, jump to that one. +" cmp %5,1 \n\t" // If there is just one k_iter, jump to that one. " beq .DLASTITER \n\t" // (as loop is do-while-like). 
" \n\t" " DLOOP: \n\t" // Body " \n\t" " fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) -" prfm PLDL1KEEP, [x1, #448] \n\t" // prefetch b row no.8, 512-64=448 +" prfm PLDL1KEEP, [%1, #448] \n\t" // prefetch b row no.8, 512-64=448 " fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) -" prfm PLDL1KEEP, [x1, #512] \n\t" // prefetch b row no.9 +" prfm PLDL1KEEP, [%1, #512] \n\t" // prefetch b row no.9 " fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) -" prfm PLDL1KEEP, [x1, #576] \n\t" // prefetch b row no.10 +" prfm PLDL1KEEP, [%1, #576] \n\t" // prefetch b row no.10 " \n\t" " fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) " fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) -" ldr z6, [x0] \n\t" // Load a( 0:3,l ) +" ldr z6, [%0] \n\t" // Load a( 0:3,l ) " \n\t" " fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) " fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) -" ldr z7, [x0, #1, MUL VL] \n\t" // load a( 4:7,l ) +" ldr z7, [%0, #1, MUL VL] \n\t" // load a( 4:7,l ) " \n\t" " fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) " fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) -" ld1rqd {z2.d}, p0/z, [x1] \n\t" // load b( l,0:1 ) +" ld1rqd {z2.d}, p0/z, [%1] \n\t" // load b( l,0:1 ) " \n\t" " fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) " fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) " fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) -" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) +" ld1rqd {z3.d}, p0/z, [%1, #16] \n\t" // load b( l,2:3 ) " \n\t" " fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) " fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) -" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) +" ld1rqd {z4.d}, p0/z, [%1, #32] \n\t" // load b( l,4:5 ) " \n\t" " fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) " fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) -" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" ld1rqd {z5.d}, p0/z, [%1, #48] \n\t" // load b( l,6:7 ) " \n\t" " \n\t" // End it 1 " \n\t" " fmla z16.d, z6.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) -" prfm PLDL1KEEP, [x1, #640] \n\t" // prefetch b row no.11 +" prfm PLDL1KEEP, [%1, #640] \n\t" // prefetch b row no.11 " fmla z17.d, z7.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) -" prfm PLDL1KEEP, [x0, #448] \n\t" // prefetch a col. no.8 +" prfm PLDL1KEEP, [%0, #448] \n\t" // prefetch a col. no.8 " fmla z18.d, z6.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) -" prfm PLDL1KEEP, [x0, #512] \n\t" // prefetch a col. no.9 +" prfm PLDL1KEEP, [%0, #512] \n\t" // prefetch a col. 
no.9 " \n\t" " fmla z19.d, z7.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) " fmla z20.d, z6.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) -" ldr z0, [x0, #2, MUL VL] \n\t" // Load a( 0:3,l ) +" ldr z0, [%0, #2, MUL VL] \n\t" // Load a( 0:3,l ) " \n\t" " fmla z21.d, z7.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) " fmla z22.d, z6.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) -" ldr z1, [x0, #3, MUL VL] \n\t" // load a( 4:7,l ) +" ldr z1, [%0, #3, MUL VL] \n\t" // load a( 4:7,l ) " \n\t" " fmla z23.d, z7.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) " fmla z24.d, z6.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) -" ld1rqd {z2.d}, p0/z, [x1, #64] \n\t" // load b( l,0:1 ) +" ld1rqd {z2.d}, p0/z, [%1, #64] \n\t" // load b( l,0:1 ) " \n\t" " fmla z25.d, z7.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) " fmla z26.d, z6.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) " fmla z27.d, z7.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) -" ld1rqd {z3.d}, p0/z, [x1, #80] \n\t" // load b( l,2:3 ) +" ld1rqd {z3.d}, p0/z, [%1, #80] \n\t" // load b( l,2:3 ) " \n\t" " fmla z28.d, z6.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) " fmla z29.d, z7.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) -" ld1rqd {z4.d}, p0/z, [x1, #96] \n\t" // load b( l,4:5 ) +" ld1rqd {z4.d}, p0/z, [%1, #96] \n\t" // load b( l,4:5 ) " \n\t" " fmla z30.d, z6.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) " fmla z31.d, z7.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) -" ld1rqd {z5.d}, p0/z, [x1, #112] \n\t" // load b( l,6:7 ) +" ld1rqd {z5.d}, p0/z, [%1, #112] \n\t" // load b( l,6:7 ) " \n\t" " \n\t" " \n\t" //End it 2 " \n\t" " fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) -" prfm PLDL1KEEP, [x0, #576] \n\t" // prefetch a col. no.10 +" prfm PLDL1KEEP, [%0, #576] \n\t" // prefetch a col. no.10 " fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) -" prfm PLDL1KEEP, [x0, #640] \n\t" // prefetch a col. no.11 +" prfm PLDL1KEEP, [%0, #640] \n\t" // prefetch a col. 
no.11 " \n\t" " fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) " \n\t" -" add x1, x1, #128 \n\t" // because immediate in 'ldr1rqd' must be +" add %1, %1, #128 \n\t" // because immediate in 'ldr1rqd' must be " \n\t" // in range -128 to 112 " \n\t" " fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) " fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) -" ldr z6, [x0, #4, MUL VL] \n\t" // Load a( 0:3,l ) +" ldr z6, [%0, #4, MUL VL] \n\t" // Load a( 0:3,l ) " \n\t" " fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) " fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) -" ldr z7, [x0, #5, MUL VL] \n\t" // load a( 4:7,l ) +" ldr z7, [%0, #5, MUL VL] \n\t" // load a( 4:7,l ) " \n\t" " fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) " fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) -" ld1rqd {z2.d}, p0/z, [x1, #0] \n\t" // load b( l,0:1 ) +" ld1rqd {z2.d}, p0/z, [%1, #0] \n\t" // load b( l,0:1 ) " \n\t" " fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) " fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) " fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) -" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) +" ld1rqd {z3.d}, p0/z, [%1, #16] \n\t" // load b( l,2:3 ) " \n\t" " fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) " fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) -" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) +" ld1rqd {z4.d}, p0/z, [%1, #32] \n\t" // load b( l,4:5 ) " \n\t" " fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) " fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) -" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" ld1rqd {z5.d}, p0/z, [%1, #48] \n\t" // load b( l,6:7 ) " \n\t" " \n\t" // End it 3 " \n\t" " fmla z16.d, z6.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) " fmla z17.d, z7.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) " fmla z18.d, z6.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) -" ldr z0, [x0, #6, MUL VL] \n\t" // Load a( 0:3,l ) +" ldr z0, [%0, #6, MUL VL] \n\t" // Load a( 0:3,l ) " \n\t" " fmla z19.d, z7.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) " fmla z20.d, z6.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) " fmla z21.d, z7.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) -" ldr z1, [x0, #7, MUL VL] \n\t" // load a( 4:7,l ) +" ldr z1, [%0, #7, MUL VL] \n\t" // load a( 4:7,l ) " \n\t" " fmla z22.d, z6.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) " fmla z23.d, z7.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) " fmla z24.d, z6.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) -" ld1rqd {z2.d}, p0/z, [x1, #64] \n\t" // load b( l,0:1 ) +" ld1rqd {z2.d}, p0/z, [%1, #64] \n\t" // load b( l,0:1 ) " \n\t" " fmla z25.d, z7.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) " fmla z26.d, z6.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) " fmla z27.d, z7.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) -" ld1rqd {z3.d}, p0/z, [x1, #80] \n\t" // load b( l,2:3 ) +" ld1rqd {z3.d}, p0/z, [%1, #80] \n\t" // load b( l,2:3 ) " \n\t" " fmla z28.d, z6.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) " fmla z29.d, z7.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) -" ld1rqd {z4.d}, 
p0/z, [x1, #96] \n\t" // load b( l,4:5 ) +" ld1rqd {z4.d}, p0/z, [%1, #96] \n\t" // load b( l,4:5 ) " \n\t" " fmla z30.d, z6.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) " fmla z31.d, z7.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) -" ld1rqd {z5.d}, p0/z, [x1, #112] \n\t" // load b( l,6:7 ) +" ld1rqd {z5.d}, p0/z, [%1, #112] \n\t" // load b( l,6:7 ) " \n\t" " \n\t" //End it 4 -" add x0, x0, #256 \n\t" -" add x1, x1, #128 \n\t" +" add %0, %0, #256 \n\t" +" add %1, %1, #128 \n\t" " \n\t" -" sub x5,x5,1 \n\t" // i-=1 -" cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1. +" sub %5,%5,1 \n\t" // i-=1 +" cmp %5,1 \n\t" // Iterate again if we are not in k_iter == 1. " bne DLOOP \n\t" " \n\t" ".DLASTITER: \n\t" @@ -307,60 +311,60 @@ __asm__ volatile " fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) " fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) " fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) -" ldr z6, [x0] \n\t" // Load a( 0:3,l ) +" ldr z6, [%0] \n\t" // Load a( 0:3,l ) " \n\t" " fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) " fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) " fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) -" ldr z7, [x0, #1, MUL VL] \n\t" // load a( 4:7,l ) +" ldr z7, [%0, #1, MUL VL] \n\t" // load a( 4:7,l ) " \n\t" " fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) " fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) " fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) -" ld1rqd {z2.d}, p0/z, [x1] \n\t" // load b( l,0:1 ) +" ld1rqd {z2.d}, p0/z, [%1] \n\t" // load b( l,0:1 ) " \n\t" " fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) " fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) " fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) -" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) +" ld1rqd {z3.d}, p0/z, [%1, #16] \n\t" // load b( l,2:3 ) " \n\t" " fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) " fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) -" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) +" ld1rqd {z4.d}, p0/z, [%1, #32] \n\t" // load b( l,4:5 ) " \n\t" " fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) " fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) -" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" ld1rqd {z5.d}, p0/z, [%1, #48] \n\t" // load b( l,6:7 ) " \n\t" " \n\t" // End it 1 " \n\t" " fmla z16.d, z6.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) " fmla z17.d, z7.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) " fmla z18.d, z6.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) -" ldr z0, [x0, #2, MUL VL] \n\t" // Load a( 0:3,l ) +" ldr z0, [%0, #2, MUL VL] \n\t" // Load a( 0:3,l ) " \n\t" " fmla z19.d, z7.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) " fmla z20.d, z6.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) " fmla z21.d, z7.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) -" ldr z1, [x0, #3, MUL VL] \n\t" // load a( 4:7,l ) +" ldr z1, [%0, #3, MUL VL] \n\t" // load a( 4:7,l ) " \n\t" " fmla z22.d, z6.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) " fmla z23.d, z7.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) " fmla 
z24.d, z6.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) -" ld1rqd {z2.d}, p0/z, [x1, #64] \n\t" // load b( l,0:1 ) +" ld1rqd {z2.d}, p0/z, [%1, #64] \n\t" // load b( l,0:1 ) " \n\t" " fmla z25.d, z7.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) " fmla z26.d, z6.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) " fmla z27.d, z7.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) -" ld1rqd {z3.d}, p0/z, [x1, #80] \n\t" // load b( l,2:3 ) +" ld1rqd {z3.d}, p0/z, [%1, #80] \n\t" // load b( l,2:3 ) " \n\t" " fmla z28.d, z6.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) " fmla z29.d, z7.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) -" ld1rqd {z4.d}, p0/z, [x1, #96] \n\t" // load b( l,4:5 ) +" ld1rqd {z4.d}, p0/z, [%1, #96] \n\t" // load b( l,4:5 ) " \n\t" " fmla z30.d, z6.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) " fmla z31.d, z7.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) -" ld1rqd {z5.d}, p0/z, [x1, #112] \n\t" // load b( l,6:7 ) +" ld1rqd {z5.d}, p0/z, [%1, #112] \n\t" // load b( l,6:7 ) " \n\t" " \n\t" " \n\t" //End it 2 @@ -368,32 +372,32 @@ __asm__ volatile " fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) " fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) " fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) -" ldr z6, [x0, #4, MUL VL] \n\t" // Load a( 0:3,l ) +" ldr z6, [%0, #4, MUL VL] \n\t" // Load a( 0:3,l ) " \n\t" " fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) " fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) " fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) -" ldr z7, [x0, #5, MUL VL] \n\t" // load a( 4:7,l ) +" ldr z7, [%0, #5, MUL VL] \n\t" // load a( 4:7,l ) " \n\t" " fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) -" add x1, x1, #128 \n\t" // because immediate in 'ldr1rqd' must be +" add %1, %1, #128 \n\t" // because immediate in 'ldr1rqd' must be " \n\t" // in range -128 to 112 " fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) " fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) -" ld1rqd {z2.d}, p0/z, [x1, #0] \n\t" // load b( l,0:1 ) +" ld1rqd {z2.d}, p0/z, [%1, #0] \n\t" // load b( l,0:1 ) " \n\t" " fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) " fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) " fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) -" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) +" ld1rqd {z3.d}, p0/z, [%1, #16] \n\t" // load b( l,2:3 ) " \n\t" " fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) " fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) -" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) +" ld1rqd {z4.d}, p0/z, [%1, #32] \n\t" // load b( l,4:5 ) " \n\t" " fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) " fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) -" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" ld1rqd {z5.d}, p0/z, [%1, #48] \n\t" // load b( l,6:7 ) " \n\t" " \n\t" // End it 3 " \n\t" @@ -414,7 +418,7 @@ __asm__ volatile " \n\t" " fmla z26.d, z6.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) " fmla z27.d, z7.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) -" add x1, x1, #64 \n\t" +" add %1, %1, #64 \n\t" " \n\t" " fmla 
z28.d, z6.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) " fmla z29.d, z7.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) @@ -423,25 +427,25 @@ __asm__ volatile " fmla z31.d, z7.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) " \n\t" " \n\t" //End it 4 -" add x0, x0, #192 \n\t" +" add %0, %0, #192 \n\t" " \n\t" " .DCONSIDERKLEFT: \n\t" -" cmp x6,0 \n\t" // If k_left == 0, we are done. +" cmp %6,0 \n\t" // If k_left == 0, we are done. " beq .DPOSTACCUM \n\t" // else, we enter the k_left loop. " \n\t" ".DLOOPKLEFT: \n\t" " \n\t" -" ldr z0, [x0] \n\t" // Load a -" ldr z1, [x0, #1, MUL VL] \n\t" -" add x0, x0, #64 \n\t" +" ldr z0, [%0] \n\t" // Load a +" ldr z1, [%0, #1, MUL VL] \n\t" +" add %0, %0, #64 \n\t" " \n\t" -" ld1rqd {z2.d}, p0/z, [x1] \n\t" // load b( l,0:1 ) -" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) -" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) -" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) -" add x1, x1, #64 \n\t" +" ld1rqd {z2.d}, p0/z, [%1] \n\t" // load b( l,0:1 ) +" ld1rqd {z3.d}, p0/z, [%1, #16] \n\t" // load b( l,2:3 ) +" ld1rqd {z4.d}, p0/z, [%1, #32] \n\t" // load b( l,4:5 ) +" ld1rqd {z5.d}, p0/z, [%1, #48] \n\t" // load b( l,6:7 ) +" add %1, %1, #64 \n\t" " \n\t" -" sub x6,x6,1 \n\t" +" sub %6,%6,1 \n\t" " \n\t" " fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) " fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) @@ -467,15 +471,15 @@ __asm__ volatile " fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) " fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) " \n\t" -" cmp x6,0 \n\t" // Iterate again. +" cmp %6,0 \n\t" // Iterate again. " bne .DLOOPKLEFT \n\t" // if i!=0. " \n\t" " .DPOSTACCUM: \n\t" " \n\t" -" ld1rd {z6.d}, p0/z, [x7] \n\t" // Load alpha. -" ld1rd {z7.d}, p0/z, [x8] \n\t" // Load beta +" ld1rd {z6.d}, p0/z, [%7] \n\t" // Load alpha. +" ld1rd {z7.d}, p0/z, [%8] \n\t" // Load beta " \n\t" -" cmp x13,#1 \n\t" // If rs_c != 1 (column-major) +" cmp %10,#1 \n\t" // If rs_c != 1 (column-major) " bne .DGENSTORED \n\t" " \n\t" " .DCOLSTORED: \n\t" // C is column-major. @@ -488,8 +492,8 @@ __asm__ volatile " fcmp d7,#0.0 \n\t" " beq .DBETAZEROCOLSTOREDS1 \n\t" // Taking care of the beta==0 case. 
" \n\t" -" ldr z0, [x2] \n\t" //Load column 0 of C -" ldr z1, [x2, #1, MUL VL] \n\t" +" ldr z0, [%2] \n\t" //Load column 0 of C +" ldr z1, [%2, #1, MUL VL] \n\t" " \n\t" " ldr z2, [x20] \n\t" //Load column 1 of C " ldr z3, [x20, #1, MUL VL] \n\t" @@ -506,8 +510,8 @@ __asm__ volatile " fmla z2.d, z18.d, z6.d[0] \n\t" // Scale by alpha " fmla z3.d, z19.d, z6.d[0] \n\t" // Scale by alpha " \n\t" -" str z0, [x2] \n\t" //Store column 0 of C -" str z1, [x2, #1, MUL VL] \n\t" +" str z0, [%2] \n\t" //Store column 0 of C +" str z1, [%2, #1, MUL VL] \n\t" " \n\t" " str z2, [x20] \n\t" //Store column 1 of C " str z3, [x20, #1, MUL VL] \n\t" @@ -597,8 +601,8 @@ __asm__ volatile " \n\t" " .DBETAZEROCOLSTOREDS4: \n\t" " \n\t" -" prfm pldl2keep,[x3] \n\t" -" prfm pldl2keep,[x4] \n\t" +" prfm pldl2keep,[%3] \n\t" +" prfm pldl2keep,[%4] \n\t" " \n\t" " fmla z8.d, z28.d, z6.d[0] \n\t" // Scale by alpha " fmla z9.d, z29.d, z6.d[0] \n\t" // Scale by alpha @@ -624,12 +628,12 @@ __asm__ volatile " \n\t" // loading/storing from column of *c " \n\t" " \n\t" // C's each column's address: -" \n\t" // x2, x20, x21, x22, x23, x24, x25, x26: are addresses of c(0,0:7) -" \n\t" // x5, x6, x7, x8, x16, x17, x18, x19: are addresses of c(4,0:7) -" add x5, x15, x2 \n\t" // x5 is address of c(4,0) -" add x6, x15, x20 \n\t" // x6 is address of c(4,1) -" add x7, x15, x21 \n\t" // x7 is address of c(4,2) -" add x8, x15, x22 \n\t" // x8 is address of c(4,3) +" \n\t" // %2, x20, x21, x22, x23, x24, x25, x26: are addresses of c(0,0:7) +" \n\t" // %5, %6, %7, %8, x16, x17, x18, x19: are addresses of c(4,0:7) +" add %5, x15, %2 \n\t" // %5 is address of c(4,0) +" add %6, x15, x20 \n\t" // %6 is address of c(4,1) +" add %7, x15, x21 \n\t" // %7 is address of c(4,2) +" add %8, x15, x22 \n\t" // %8 is address of c(4,3) " add x16, x15, x23 \n\t" // x16 is address of c(4,4) " add x17, x15, x24 \n\t" // x17 is address of c(4,5) " add x18, x15, x25 \n\t" // x18 is address of c(4,6) @@ -643,14 +647,14 @@ __asm__ volatile " fcmp d7,#0.0 \n\t" " beq .DBETAZEROGENSTOREDS1 \n\t" // Taking care of the beta==0 case. 
" \n\t" -" \n\t" // x2 is address of c(0,0) -" \n\t" // x5 is address of c(4,0) +" \n\t" // %2 is address of c(0,0) +" \n\t" // %5 is address of c(4,0) " \n\t" // x20 is address of c(0,1) -" \n\t" // x6 is address of c(4,1) -" ld1d {z0.d}, p0/z, [x2, z4.d] \n\t" // Load c( 0:3,0 ) into z0 -" ld1d {z1.d}, p0/z, [x5, z4.d] \n\t" // Load c( 4:7,0 ) into z1 +" \n\t" // %6 is address of c(4,1) +" ld1d {z0.d}, p0/z, [%2, z4.d] \n\t" // Load c( 0:3,0 ) into z0 +" ld1d {z1.d}, p0/z, [%5, z4.d] \n\t" // Load c( 4:7,0 ) into z1 " ld1d {z2.d}, p0/z, [x20, z4.d] \n\t" // Load c( 0:3,1 ) into z2 -" ld1d {z3.d}, p0/z, [x6 , z4.d] \n\t" // Load c( 4:7,1 ) into z3 +" ld1d {z3.d}, p0/z, [%6 , z4.d] \n\t" // Load c( 4:7,1 ) into z3 " \n\t" " fmul z0.d, z0.d, z7.d \n\t" // Scale by beta " fmul z1.d, z1.d, z7.d \n\t" // Scale by beta @@ -664,10 +668,10 @@ __asm__ volatile " fmla z2.d, z18.d, z6.d[0] \n\t" // Scale by alpha " fmla z3.d, z19.d, z6.d[0] \n\t" // Scale by alpha " \n\t" -" st1d {z0.d}, p0, [x2 , z4.d] \n\t" // Store c( 0:3,0 ) <- z0 -" st1d {z1.d}, p0, [x5 , z4.d] \n\t" // Store c( 4:7,0 ) <- z1 +" st1d {z0.d}, p0, [%2 , z4.d] \n\t" // Store c( 0:3,0 ) <- z0 +" st1d {z1.d}, p0, [%5 , z4.d] \n\t" // Store c( 4:7,0 ) <- z1 " st1d {z2.d}, p0, [x20, z4.d] \n\t" // Store c( 0:3,1 ) <- z2 -" st1d {z3.d}, p0, [x6 , z4.d] \n\t" // Store c( 4:7,1 ) <- z3 +" st1d {z3.d}, p0, [%6 , z4.d] \n\t" // Store c( 4:7,1 ) <- z3 " \n\t" " \n\t" " \n\t" @@ -680,13 +684,13 @@ __asm__ volatile " beq .DBETAZEROGENSTOREDS2 \n\t" // Taking care of the beta==0 case. " \n\t" " \n\t" // x21 is address of c(0,2) -" \n\t" // x7 is address of c(4,2) +" \n\t" // %7 is address of c(4,2) " \n\t" // x22 is address of c(0,3) -" \n\t" // x8 is address of c(4,3) +" \n\t" // %8 is address of c(4,3) " ld1d {z8.d}, p0/z, [x21, z4.d] \n\t" // Load c( 0:3,2 ) into z8 -" ld1d {z9.d}, p0/z, [x7 , z4.d] \n\t" // Load c( 4:7,2 ) into z9 +" ld1d {z9.d}, p0/z, [%7 , z4.d] \n\t" // Load c( 4:7,2 ) into z9 " ld1d {z10.d}, p0/z, [x22, z4.d] \n\t" // Load c( 0:3,3 ) into z10 -" ld1d {z11.d}, p0/z, [x8 , z4.d] \n\t" // Load c( 4:7,3 ) into z11 +" ld1d {z11.d}, p0/z, [%8 , z4.d] \n\t" // Load c( 4:7,3 ) into z11 " \n\t" " fmul z8.d, z8.d, z7.d \n\t" // Scale by beta " fmul z9.d, z9.d, z7.d \n\t" // Scale by beta @@ -701,9 +705,9 @@ __asm__ volatile " fmla z11.d, z23.d, z6.d[0] \n\t" // Scale by alpha " \n\t" " st1d {z8.d}, p0, [x21, z4.d] \n\t" // Store c( 0:3,2 ) <- z8 -" st1d {z9.d}, p0, [x7 , z4.d] \n\t" // Store c( 4:7,2 ) <- z9 +" st1d {z9.d}, p0, [%7 , z4.d] \n\t" // Store c( 4:7,2 ) <- z9 " st1d {z10.d}, p0, [x22, z4.d] \n\t" // Store c( 0:3,3 ) <- z10 -" st1d {z11.d}, p0, [x8 , z4.d] \n\t" // Store c( 4:7,3 ) <- z11 +" st1d {z11.d}, p0, [%8 , z4.d] \n\t" // Store c( 4:7,3 ) <- z11 " \n\t" " dup z0.d, #0 \n\t" // C column 4, 5 " dup z1.d, #0 \n\t" @@ -775,24 +779,21 @@ __asm__ volatile " \n\t" " .DEND: \n\t" // Done! 
" \n\t" -:// output operands (none) -:// input operands - [aaddr] "m" (a), // 0 - [baddr] "m" (b), // 1 - [caddr] "m" (c), // 2 - [k_iter] "m" (k_iter), // 3 - [k_left] "m" (k_left), // 4 - [alpha] "m" (alpha), // 5 - [beta] "m" (beta), // 6 - [rs_c] "m" (rs_c), // 6 - [cs_c] "m" (cs_c), // 7 - [a_next] "m" (a_next), // 8 - [b_next] "m" (b_next) // 9 +:// input/output operands + "+r" (a), // %0 + "+r" (b), // %1 + "+r" (c), // %2 + "+r" (a_next), // %3 + "+r" (b_next), // %4 + "+r" (k_iter), // %5 + "+r" (k_left), // %6 + "+r" (alpha), // %7 + "+r" (beta), // %8 + "+r" (cs_c), // %9 + "+r" (rs_c) // %10 +:// input-only operands (none) :// Register clobber list - "x0","x1","x2","x3", - "x4","x5","x6", - "x7","x8","x9", - "x10","x11","x12","x13","x14","x15","x16","x17","x18","x19", + "x10","x11","x12","x14","x15","x16","x17","x18","x19", "x20","x21","x22","x23","x24","x25","x26", "x27", "v0","v1","v2", diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c new file mode 100644 index 0000000000..4df75c7691 --- /dev/null +++ b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c @@ -0,0 +1,300 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2019, Forschunszentrum Juelich + Copyright (C) 2020, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ +#include "blis.h" + +// Single-precision composite instructions. +#include "armsve_asm_macros_scomplex.h" + +// 2vx10 microkernels. 
+#include "armsve_asm_2vx10cmplx.h" + +void bli_cgemm_armsve_asm_2vx10_unindexed + ( + dim_t k0, + scomplex* restrict alpha, + scomplex* restrict a, + scomplex* restrict b, + scomplex* restrict beta, + scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + void* a_next = bli_auxinfo_next_a( data ); + void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_mker = k0 / 4; + uint64_t k_left = k0 % 4; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + uint64_t info = 0; + + __asm__ volatile ( +// " ldr x0, %[a] \n\t" +// " ldr x1, %[b] \n\t" +" mov x2, xzr \n\t" +" incw x2, ALL, MUL #1 \n\t" // Column-skip of A. +" mov x3, #10 \n\t" // Row-skip of B. +" \n\t" +// " ldr x2, %[c] \n\t" +// " ldr x3, %[rs_c] \n\t" // Row-skip of C. +// " ldr x4, %[cs_c] \n\t" // Column-skip of C. +#ifdef _A64FX +" mov x16, 0x1 \n\t" // Tag A address. +" lsl x16, x16, #56 \n\t" +" orr %0, %0, x16 \n\t" +" mov x16, 0x2 \n\t" // Tag B address. +" lsl x16, x16, #56 \n\t" +" orr %1, %1, x16 \n\t" +" mov x16, 0x3 \n\t" // Tag C address. +" lsl x16, x16, #56 \n\t" +" orr %2, %2, x16 \n\t" +#endif +" \n\t" +" mov x16, #8 \n\t" // Multiply some address skips by sizeof(scomplex). +" madd x2, x16, x2, xzr \n\t" // cs_a +" madd x3, x16, x3, xzr \n\t" // rs_b +" madd %4, x16, %4, xzr \n\t" // cs_c +" ptrue p0.s \n\t" +" \n\t" +// " ldr x5, %[k_mker] \n\t" // Number of loops. +// " ldr x6, %[k_left] \n\t" +" \n\t" +" LOAD_ABC: \n\t" +" cmp %5, #0 \n\t" // Don't preload if no microkernel there. +" b.eq END_CCOL_PRFM \n\t" +" \n\t" +" ld1rw z20.s, p0/z, [%1, 4*0] \n\t" // Load B's real 8/10, no imaginary. +" ld1rw z21.s, p0/z, [%1, 4*2] \n\t" +" ld1rw z22.s, p0/z, [%1, 4*4] \n\t" +" ld1rw z23.s, p0/z, [%1, 4*6] \n\t" +" ld1rw z24.s, p0/z, [%1, 4*8] \n\t" +" ld1rw z25.s, p0/z, [%1, 4*10] \n\t" +" ld1rw z26.s, p0/z, [%1, 4*12] \n\t" +" ld1rw z27.s, p0/z, [%1, 4*14] \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +" \n\t" +" CCOL_PRFM: \n\t" +" cmp %3, #1 \n\t" +" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. +" mov x16, %2 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" END_CCOL_PRFM: \n\t" +" \n\t" +CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) +" \n\t" +" cmp %5, #0 \n\t" // If no 4-microkernel can be applied. 
+" b.eq K_LEFT_LOOP \n\t" +" \n\t" +" K_MKER_LOOP: \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2) +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2) +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +" subs %5, %5, #1 \n\t" // Decrease counter before final replica. +" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" b K_MKER_LOOP \n\t" +" \n\t" +" FIN_MKER_LOOP: \n\t" +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +" K_LEFT_LOOP: \n\t" +" cmp %6, #0 \n\t" // End of execution. +" b.eq WRITE_MEM_PREP \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +" ld1rw z20.s, p0/z, [%1, 4*0] \n\t" // Load B's real 8/10, no imaginary. +" ld1rw z21.s, p0/z, [%1, 4*2] \n\t" +" ld1rw z22.s, p0/z, [%1, 4*4] \n\t" +" ld1rw z23.s, p0/z, [%1, 4*6] \n\t" +" ld1rw z24.s, p0/z, [%1, 4*8] \n\t" +" ld1rw z25.s, p0/z, [%1, 4*10] \n\t" +" ld1rw z26.s, p0/z, [%1, 4*12] \n\t" +" ld1rw z27.s, p0/z, [%1, 4*14] \n\t" +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" sub %6, %6, #1 \n\t" +" b K_LEFT_LOOP \n\t" // Next column / row. +" \n\t" +" WRITE_MEM_PREP: \n\t" +" \n\t" +// " ldr x7, %[alpha] \n\t" // Load alpha & beta (address). +// " ldr x8, %[beta] \n\t" +" ld1rw z28.s, p0/z, [%7] \n\t" // Real(alpha). +" ld1rw z29.s, p0/z, [%7, 4] \n\t" // Imag(alpha). +" ld1rw z30.s, p0/z, [%8] \n\t" // Real(beta). +" ld1rw z31.s, p0/z, [%8, 4] \n\t" // Imag(beta). +" \n\t" +" PREFETCH_ABNEXT: \n\t" +// " ldr x9, %[a_next] \n\t" +// " ldr x10, %[b_next] \n\t" +#ifdef _A64FX +" mov x16, 0x1 \n\t" // Tag A address. +" lsl x16, x16, #56 \n\t" +" orr %9, %9, x16 \n\t" +" mov x16, 0x2 \n\t" // Tag B address. +" lsl x16, x16, #56 \n\t" +" orr %10, %10, x16 \n\t" +#endif +" prfm PLDL1STRM, [%9] \n\t" +" prfm PLDL1STRM, [%9, 256*1] \n\t" +" prfm PLDL1STRM, [%10] \n\t" +" prfm PLDL1STRM, [%10, 256*1] \n\t" +" \n\t" +" WRITE_MEM: \n\t" +" fmov s27, #1.0 \n\t" +" fcmp s29, #0.0 \n\t" // Whether Imag(alpha) == 0. +" fccmp s28, s27, 0, eq \n\t" // Whether Real(alpha) == 1. 
+" b.eq UNIT_ALPHA \n\t" +" \n\t" +GEMM_FMULCMPLX_COL2(z20,z21,z22,z23,p0,z0 ,z1 ,z2 ,z3 ,z28,z29) +GEMM_FMULCMPLX_COL2(z24,z25,z26,z27,p0,z4 ,z5 ,z6 ,z7 ,z28,z29) +GEMM_FMULCMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z8, z9, z10,z11,z28,z29) +GEMM_FMULCMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z12,z13,z14,z15,z28,z29) +GEMM_FMULCMPLX_COL2(z8 ,z9 ,z10,z11,p0,z16,z17,z18,z19,z28,z29) +" b WRITE_MEM_EXEC \n\t" +" \n\t" +" UNIT_ALPHA: \n\t" +MOV_COL2(z20,z21,z22,z23,z0 ,z1 ,z2 ,z3 ) +MOV_COL2(z24,z25,z26,z27,z4 ,z5 ,z6 ,z7 ) +MOV_COL2(z0 ,z1 ,z2 ,z3 ,z8, z9, z10,z11) +MOV_COL2(z4 ,z5 ,z6 ,z7 ,z12,z13,z14,z15) +MOV_COL2(z8 ,z9 ,z10,z11,z16,z17,z18,z19) +" \n\t" +" WRITE_MEM_EXEC: \n\t" +" mov x9, %2 \n\t" // C address for loading. +" \n\t" // C address for storing is %2 itself. +" cmp %3, #1 \n\t" +" b.ne WRITE_MEM_G \n\t" +" \n\t" +" WRITE_MEM_C: \n\t" +GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4) +GEMM_CCMPLX_LOAD_COL2_C(z16,z17,z18,z19,p0,x9,%4) +GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) +GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) +GEMM_CCMPLX_STORE_COL2_C(z20,z21,z22,z23,p0,%2,%4) +GEMM_CCMPLX_STORE_COL2_C(z24,z25,z26,z27,p0,%2,%4) +" \n\t" +GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4) +GEMM_CCMPLX_LOAD_COL2_C(z16,z17,z18,z19,p0,x9,%4) +GEMM_CCMPLX_LOAD_COL2_C(z20,z21,z22,z23,p0,x9,%4) +GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) +GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) +GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) +GEMM_CCMPLX_STORE_COL2_C(z0 ,z1 ,z2 ,z3 ,p0,%2,%4) +GEMM_CCMPLX_STORE_COL2_C(z4 ,z5 ,z6 ,z7 ,p0,%2,%4) +GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4) +" b END_WRITE_MEM \n\t" +" \n\t" +" WRITE_MEM_G: \n\t" +" add %3, %3, %3 \n\t" // Skips passed to index is multiplied by 2, +" mov x3, %3 \n\t" // s.t. 2*sizeof(float) = 2*4 = 8. +" index z28.s, wzr, w3 \n\t" +GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) +GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) +GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) +GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) +GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16) +GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16) +" \n\t" +GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) +GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) +GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16) +GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) +GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) +GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) +GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16) +GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16) +GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16) +" \n\t" +" END_WRITE_MEM: \n\t" +" b END_EXEC \n\t" +" \n\t" +" END_EXEC: \n\t" +" mov %11, #0 \n\t" // Return normal. 
+: "+r" (a), // %0 + "+r" (b), // %1 + "+r" (c), // %2 + "+r" (rs_c), // %3 + "+r" (cs_c), // %4 + "+r" (k_mker), // %5 + "+r" (k_left), // %6 + "+r" (alpha), // %7 + "+r" (beta), // %8 + "+r" (a_next), // %9 + "+r" (b_next), // %10 + "=r" (info) // %11 +: +: "x2","x3","x9","x16", + "z0","z1","z2","z3","z4","z5","z6","z7", + "z8","z9","z10","z11","z12","z13","z14","z15", + "z16","z17","z18","z19", + "z20","z21","z22","z23", + "z24","z25","z26","z27", + "z28","z29","z30","z31" + ); +} + diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c new file mode 100644 index 0000000000..90f212dbd1 --- /dev/null +++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c @@ -0,0 +1,299 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2019, Forschunszentrum Juelich + Copyright (C) 2020, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ +#include "blis.h" + +// Double-precision composite instructions. +#include "armsve_asm_macros_dcomplex.h" + +// 2vx10 microkernels. +#include "armsve_asm_2vx10cmplx.h" + +void bli_zgemm_armsve_asm_2vx10_unindexed + ( + dim_t k0, + dcomplex* restrict alpha, + dcomplex* restrict a, + dcomplex* restrict b, + dcomplex* restrict beta, + dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + void* a_next = bli_auxinfo_next_a( data ); + void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_mker = k0 / 4; + uint64_t k_left = k0 % 4; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + uint64_t info = 0; + + __asm__ volatile ( +// " ldr x0, %[a] \n\t" +// " ldr x1, %[b] \n\t" +" mov x2, xzr \n\t" +" incd x2, ALL, MUL #1 \n\t" // Column-skip of A. +" mov x3, #10 \n\t" // Row-skip of B. +" \n\t" +// " ldr x2, %[c] \n\t" +// " ldr x3, %[rs_c] \n\t" // Row-skip of C. 
+// " ldr x4, %[cs_c] \n\t" // Column-skip of C. +#ifdef _A64FX +" mov x16, 0x1 \n\t" // Tag A address. +" lsl x16, x16, #56 \n\t" +" orr %0, %0, x16 \n\t" +" mov x16, 0x2 \n\t" // Tag B address. +" lsl x16, x16, #56 \n\t" +" orr %1, %1, x16 \n\t" +" mov x16, 0x3 \n\t" // Tag C address. +" lsl x16, x16, #56 \n\t" +" orr %2, %2, x16 \n\t" +#endif +" \n\t" +" mov x16, #16 \n\t" // Multiply some address skips by sizeof(dcomplex). +" madd x2, x16, x2, xzr \n\t" // cs_a +" madd x3, x16, x3, xzr \n\t" // rs_b +" madd %4, x16, %4, xzr \n\t" // cs_c +" ptrue p0.d \n\t" +" \n\t" +// " ldr x5, %[k_mker] \n\t" // Number of loops. +// " ldr x6, %[k_left] \n\t" +" \n\t" +" LOAD_ABC: \n\t" +" cmp %5, #0 \n\t" // Don't preload if no microkernel there. +" b.eq END_CCOL_PRFM \n\t" +" \n\t" +" ld1rd z20.d, p0/z, [%1, 8*0] \n\t" // Load B's real 8/10, no imaginary. +" ld1rd z21.d, p0/z, [%1, 8*2] \n\t" +" ld1rd z22.d, p0/z, [%1, 8*4] \n\t" +" ld1rd z23.d, p0/z, [%1, 8*6] \n\t" +" ld1rd z24.d, p0/z, [%1, 8*8] \n\t" +" ld1rd z25.d, p0/z, [%1, 8*10] \n\t" +" ld1rd z26.d, p0/z, [%1, 8*12] \n\t" +" ld1rd z27.d, p0/z, [%1, 8*14] \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +" \n\t" +" CCOL_PRFM: \n\t" +" cmp %3, #1 \n\t" +" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. +" mov x16, %2 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" END_CCOL_PRFM: \n\t" +" \n\t" +CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) +" \n\t" +" cmp %5, #0 \n\t" // If no 4-microkernel can be applied. +" b.eq K_LEFT_LOOP \n\t" +" \n\t" +" K_MKER_LOOP: \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2) +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2) +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +" subs %5, %5, #1 \n\t" // Decrease counter before final replica. +" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" b K_MKER_LOOP \n\t" +" \n\t" +" FIN_MKER_LOOP: \n\t" +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +" K_LEFT_LOOP: \n\t" +" cmp %6, #0 \n\t" // End of execution. 
+" b.eq WRITE_MEM_PREP \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +" ld1rd z20.d, p0/z, [%1, 8*0] \n\t" // Load B's real 8/10, no imaginary. +" ld1rd z21.d, p0/z, [%1, 8*2] \n\t" +" ld1rd z22.d, p0/z, [%1, 8*4] \n\t" +" ld1rd z23.d, p0/z, [%1, 8*6] \n\t" +" ld1rd z24.d, p0/z, [%1, 8*8] \n\t" +" ld1rd z25.d, p0/z, [%1, 8*10] \n\t" +" ld1rd z26.d, p0/z, [%1, 8*12] \n\t" +" ld1rd z27.d, p0/z, [%1, 8*14] \n\t" +GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" sub %6, %6, #1 \n\t" +" b K_LEFT_LOOP \n\t" // Next column / row. +" \n\t" +" WRITE_MEM_PREP: \n\t" +" \n\t" +// " ldr x7, %[alpha] \n\t" // Load alpha & beta (address). +// " ldr x8, %[beta] \n\t" +" ld1rd z28.d, p0/z, [%7] \n\t" // Real(alpha). +" ld1rd z29.d, p0/z, [%7, 8] \n\t" // Imag(alpha). +" ld1rd z30.d, p0/z, [%8] \n\t" // Real(beta). +" ld1rd z31.d, p0/z, [%8, 8] \n\t" // Imag(beta). +" \n\t" +" PREFETCH_ABNEXT: \n\t" +// " ldr x9, %[a_next] \n\t" +// " ldr x10, %[b_next] \n\t" +#ifdef _A64FX +" mov x16, 0x1 \n\t" // Tag A address. +" lsl x16, x16, #56 \n\t" +" orr %9, %9, x16 \n\t" +" mov x16, 0x2 \n\t" // Tag B address. +" lsl x16, x16, #56 \n\t" +" orr %10, %10, x16 \n\t" +#endif +" prfm PLDL1STRM, [%9] \n\t" +" prfm PLDL1STRM, [%9, 256*1] \n\t" +" prfm PLDL1STRM, [%10] \n\t" +" prfm PLDL1STRM, [%10, 256*1] \n\t" +" \n\t" +" WRITE_MEM: \n\t" +" fmov d27, #1.0 \n\t" +" fcmp d29, #0.0 \n\t" // Whether Imag(alpha) == 0. +" fccmp d28, d27, 0, eq \n\t" // Whether Real(alpha) == 1. +" b.eq UNIT_ALPHA \n\t" +" \n\t" +GEMM_FMULCMPLX_COL2(z20,z21,z22,z23,p0,z0 ,z1 ,z2 ,z3 ,z28,z29) +GEMM_FMULCMPLX_COL2(z24,z25,z26,z27,p0,z4 ,z5 ,z6 ,z7 ,z28,z29) +GEMM_FMULCMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z8, z9, z10,z11,z28,z29) +GEMM_FMULCMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z12,z13,z14,z15,z28,z29) +GEMM_FMULCMPLX_COL2(z8 ,z9 ,z10,z11,p0,z16,z17,z18,z19,z28,z29) +" b WRITE_MEM_EXEC \n\t" +" \n\t" +" UNIT_ALPHA: \n\t" +MOV_COL2(z20,z21,z22,z23,z0 ,z1 ,z2 ,z3 ) +MOV_COL2(z24,z25,z26,z27,z4 ,z5 ,z6 ,z7 ) +MOV_COL2(z0 ,z1 ,z2 ,z3 ,z8, z9, z10,z11) +MOV_COL2(z4 ,z5 ,z6 ,z7 ,z12,z13,z14,z15) +MOV_COL2(z8 ,z9 ,z10,z11,z16,z17,z18,z19) +" \n\t" +" WRITE_MEM_EXEC: \n\t" +" mov x9, %2 \n\t" // C address for loading. +" \n\t" // C address for storing is %2 itself. +" cmp %3, #1 \n\t" +" b.ne WRITE_MEM_G \n\t" +" \n\t" +" WRITE_MEM_C: \n\t" +GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4) +GEMM_CCMPLX_LOAD_COL2_C(z16,z17,z18,z19,p0,x9,%4) +GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) +GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) +GEMM_CCMPLX_STORE_COL2_C(z20,z21,z22,z23,p0,%2,%4) +GEMM_CCMPLX_STORE_COL2_C(z24,z25,z26,z27,p0,%2,%4) +" \n\t" +GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4) +GEMM_CCMPLX_LOAD_COL2_C(z16,z17,z18,z19,p0,x9,%4) +GEMM_CCMPLX_LOAD_COL2_C(z20,z21,z22,z23,p0,x9,%4) +GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) +GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) +GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) +GEMM_CCMPLX_STORE_COL2_C(z0 ,z1 ,z2 ,z3 ,p0,%2,%4) +GEMM_CCMPLX_STORE_COL2_C(z4 ,z5 ,z6 ,z7 ,p0,%2,%4) +GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4) +" b END_WRITE_MEM \n\t" +" \n\t" +" WRITE_MEM_G: \n\t" +" add %3, %3, %3 \n\t" // Skips passed to index is multiplied by 2, +" index z28.d, xzr, %3 \n\t" // s.t. 2*sizeof(double) = 2*8 = 16. 
+GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) +GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) +GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) +GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) +GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16) +GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16) +" \n\t" +GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) +GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) +GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16) +GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) +GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) +GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) +GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16) +GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16) +GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16) +" \n\t" +" END_WRITE_MEM: \n\t" +" b END_EXEC \n\t" +" \n\t" +" END_EXEC: \n\t" +" mov %11, #0 \n\t" // Return normal. +: "+r" (a), // %0 + "+r" (b), // %1 + "+r" (c), // %2 + "+r" (rs_c), // %3 + "+r" (cs_c), // %4 + "+r" (k_mker), // %5 + "+r" (k_left), // %6 + "+r" (alpha), // %7 + "+r" (beta), // %8 + "+r" (a_next), // %9 + "+r" (b_next), // %10 + "=r" (info) // %11 +: +: "x2","x3","x9","x16", + "z0","z1","z2","z3","z4","z5","z6","z7", + "z8","z9","z10","z11","z12","z13","z14","z15", + "z16","z17","z18","z19", + "z20","z21","z22","z23", + "z24","z25","z26","z27", + "z28","z29","z30","z31" + ); +} + diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c new file mode 100644 index 0000000000..3d25719d92 --- /dev/null +++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c @@ -0,0 +1,266 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2019, Forschunszentrum Juelich + Copyright (C) 2020, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +*/ +#include "blis.h" + +// Double-precision composite instructions. +#include "armsve_asm_macros_dcomplex.h" + +// 2vx7 microkernels. +#include "armsve_asm_2vx7cmplx.h" + +void bli_zgemm_armsve_asm_2vx7_unindexed + ( + dim_t k0, + dcomplex* restrict alpha, + dcomplex* restrict a, + dcomplex* restrict b, + dcomplex* restrict beta, + dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + void* a_next = bli_auxinfo_next_a( data ); + void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_mker = k0 / 4; + uint64_t k_left = k0 % 4; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + uint64_t info = 0; + + __asm__ volatile ( +// " ldr x0, %[a] \n\t" +// " ldr x1, %[b] \n\t" +" mov x2, xzr \n\t" +" incd x2, ALL, MUL #1 \n\t" // Column-skip of A. +" mov x3, #7 \n\t" // Row-skip of B. +" \n\t" +// " ldr x2, %[c] \n\t" +// " ldr x3, %[rs_c] \n\t" // Row-skip of C. +// " ldr x4, %[cs_c] \n\t" // Column-skip of C. +#ifdef _A64FX +" mov x16, 0x1 \n\t" // Tag A address. +" lsl x16, x16, #56 \n\t" +" orr %0, %0, x16 \n\t" +" mov x16, 0x2 \n\t" // Tag B address. +" lsl x16, x16, #56 \n\t" +" orr %1, %1, x16 \n\t" +" mov x16, 0x3 \n\t" // Tag C address. +" lsl x16, x16, #56 \n\t" +" orr %2, %2, x16 \n\t" +#endif +" \n\t" +" mov x16, #16 \n\t" // Multiply some address skips by sizeof(dcomplex). +" madd x2, x16, x2, xzr \n\t" // cs_a +" madd x3, x16, x3, xzr \n\t" // rs_b +" madd %4, x16, %4, xzr \n\t" // cs_c +" ptrue p0.d \n\t" +" \n\t" +// " ldr x5, %[k_mker] \n\t" // Number of loops. +// " ldr x6, %[k_left] \n\t" +" \n\t" +" LOAD_ABC: \n\t" +" cmp %5, #0 \n\t" // Don't preload if no microkernel there. +" b.eq END_CCOL_PRFM \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +" \n\t" +" ld1rd z14.d, p0/z, [%1, 8*0] \n\t" // Load B's real & imaginary. +" ld1rd z15.d, p0/z, [%1, 8*2] \n\t" +" ld1rd z16.d, p0/z, [%1, 8*4] \n\t" +" ld1rd z17.d, p0/z, [%1, 8*6] \n\t" +" ld1rd z18.d, p0/z, [%1, 8*8] \n\t" +" ld1rd z19.d, p0/z, [%1, 8*10] \n\t" +" ld1rd z20.d, p0/z, [%1, 8*12] \n\t" +" ld1rd z21.d, p0/z, [%1, 8*1] \n\t" +" ld1rd z22.d, p0/z, [%1, 8*3] \n\t" +" ld1rd z23.d, p0/z, [%1, 8*5] \n\t" +" ld1rd z24.d, p0/z, [%1, 8*7] \n\t" +" ld1rd z25.d, p0/z, [%1, 8*9] \n\t" +" ld1rd z26.d, p0/z, [%1, 8*11] \n\t" +" ld1rd z27.d, p0/z, [%1, 8*13] \n\t" +" add %1, %1, x3 \n\t" +" \n\t" +" CCOL_PRFM: \n\t" +" cmp %3, #1 \n\t" +" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. 
+" mov x16, %2 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" END_CCOL_PRFM: \n\t" +" \n\t" +CLEAR_COL14(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13) +" \n\t" +" cmp %5, #0 \n\t" // If no 4-microkernel can be applied +" b.eq K_LEFT_LOOP \n\t" +" \n\t" +" K_MKER_LOOP: \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2) +GEMM_2VX7CMPLX_MKER_LOOP_PLAIN_C(z0,z2,z4,z6,z8,z10,z12,z1,z3,z5,z7,z9,z11,z13,p0,z28,z29,z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +GEMM_2VX7CMPLX_MKER_LOOP_PLAIN_C(z0,z2,z4,z6,z8,z10,z12,z1,z3,z5,z7,z9,z11,z13,p0,z30,z31,z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2) +GEMM_2VX7CMPLX_MKER_LOOP_PLAIN_C(z0,z2,z4,z6,z8,z10,z12,z1,z3,z5,z7,z9,z11,z13,p0,z28,z29,z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +" subs %5, %5, #1 \n\t" // Decrease counter before final replica. +" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +GEMM_2VX7CMPLX_MKER_LOOP_PLAIN_C(z0,z2,z4,z6,z8,z10,z12,z1,z3,z5,z7,z9,z11,z13,p0,z30,z31,z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" b K_MKER_LOOP \n\t" +" \n\t" +" FIN_MKER_LOOP: \n\t" +GEMM_2VX7CMPLX_MKER_LOOP_PLAIN_C_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z1,z3,z5,z7,z9,z11,z13,p0,z30,z31,z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" \n\t" +" K_LEFT_LOOP: \n\t" +" cmp %6, #0 \n\t" // End of execution. +" b.eq WRITE_MEM_PREP \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) +" ld1rd z14.d, p0/z, [%1, 8*0] \n\t" +" ld1rd z15.d, p0/z, [%1, 8*2] \n\t" +" ld1rd z16.d, p0/z, [%1, 8*4] \n\t" +" ld1rd z17.d, p0/z, [%1, 8*6] \n\t" +" ld1rd z18.d, p0/z, [%1, 8*8] \n\t" +" ld1rd z19.d, p0/z, [%1, 8*10] \n\t" +" ld1rd z20.d, p0/z, [%1, 8*12] \n\t" +" ld1rd z21.d, p0/z, [%1, 8*1] \n\t" +" ld1rd z22.d, p0/z, [%1, 8*3] \n\t" +" ld1rd z23.d, p0/z, [%1, 8*5] \n\t" +" ld1rd z24.d, p0/z, [%1, 8*7] \n\t" +" ld1rd z25.d, p0/z, [%1, 8*9] \n\t" +" ld1rd z26.d, p0/z, [%1, 8*11] \n\t" +" ld1rd z27.d, p0/z, [%1, 8*13] \n\t" +" add %1, %1, x3 \n\t" +GEMM_2VX7CMPLX_MKER_LOOP_PLAIN_C_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z1,z3,z5,z7,z9,z11,z13,p0,z28,z29,z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) +" sub %6, %6, #1 \n\t" +" b K_LEFT_LOOP \n\t" // Next column / row. +" \n\t" +" WRITE_MEM_PREP: \n\t" +" \n\t" +// " ldr x7, %[alpha] \n\t" // Load alpha & beta (address). +// " ldr x8, %[beta] \n\t" +" ld1rd z28.d, p0/z, [%7] \n\t" // Real(alpha). +" ld1rd z29.d, p0/z, [%7, 8] \n\t" // Imag(alpha). +" ld1rd z30.d, p0/z, [%8] \n\t" // Real(beta). +" ld1rd z31.d, p0/z, [%8, 8] \n\t" // Imag(beta). +" \n\t" +" PREFETCH_ABNEXT: \n\t" +// " ldr x9, %[a_next] \n\t" +// " ldr x10, %[b_next] \n\t" +#ifdef _A64FX +" mov x16, 0x1 \n\t" // Tag A address. +" lsl x16, x16, #56 \n\t" +" orr %9, %9, x16 \n\t" +" mov x16, 0x2 \n\t" // Tag B address. 
+" lsl x16, x16, #56 \n\t" +" orr %10, %10, x16 \n\t" +#endif +" prfm PLDL1STRM, [%9] \n\t" +" prfm PLDL1STRM, [%9, 256*1] \n\t" +" prfm PLDL1STRM, [%10] \n\t" +" prfm PLDL1STRM, [%10, 256*1] \n\t" +" \n\t" +" WRITE_MEM: \n\t" +" \n\t" +GEMM_FMULCMPLX_COL7(z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,p0,z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z28,z29) +" \n\t" +" UNIT_ALPHA: \n\t" +" mov x9, %2 \n\t" // C address for loading. +" \n\t" // C address for storing is %2 itself. +" cmp %3, #1 \n\t" +" b.ne WRITE_MEM_G \n\t" +" \n\t" +" WRITE_MEM_C: \n\t" +GEMM_CCMPLX_LOAD_COL7_C(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,p0,x9,%4) +GEMM_FMLACMPLX_COL7(z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,p0,z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z30,z31) +GEMM_CCMPLX_STORE_COL7_C(z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,p0,%2,%4) +" b END_WRITE_MEM \n\t" +" \n\t" +" WRITE_MEM_G: \n\t" +" add %3, %3, %3 \n\t" // Skips passed to index is multiplied by 2, +" index z28.d, xzr, %3 \n\t" // s.t. 2*sizeof(double) = 2*8 = 16. +GEMM_CCMPLX_LOAD_COL7_G(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,p0,z28,x9,%4,x16) +GEMM_FMLACMPLX_COL7(z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,p0,z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z30,z31) +GEMM_CCMPLX_STORE_COL7_G(z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,p0,z28,%2,%4,x16) +" \n\t" +" END_WRITE_MEM: \n\t" +" b END_EXEC \n\t" +" \n\t" +" END_EXEC: \n\t" +" mov %11, #0 \n\t" // Return normal. +: "+r" (a), // %0 + "+r" (b), // %1 + "+r" (c), // %2 + "+r" (rs_c), // %3 + "+r" (cs_c), // %4 + "+r" (k_mker), // %5 + "+r" (k_left), // %6 + "+r" (alpha), // %7 + "+r" (beta), // %8 + "+r" (a_next), // %9 + "+r" (b_next), // %10 + "=r" (info) // %11 +: +: "x2","x3","x9","x16", + "z0","z1","z2","z3","z4","z5","z6","z7", + "z8","z9","z10","z11","z12","z13","z14","z15", + "z16","z17","z18","z19", + "z20","z21","z22","z23", + "z24","z25","z26","z27", + "z28","z29","z30","z31" + ); +} + + diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c new file mode 100644 index 0000000000..d0eef4a8ca --- /dev/null +++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c @@ -0,0 +1,290 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2019, Forschunszentrum Juelich + Copyright (C) 2020, The University of Tokyo + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ +#include "blis.h" + +// Double-precision composite instructions. +#include "armsve_asm_macros_dcomplex.h" + +// 2vx8 microkernels. +#include "armsve_asm_2vx8cmplx.h" + +void bli_zgemm_armsve_asm_2vx8_unindexed + ( + dim_t k0, + dcomplex* restrict alpha, + dcomplex* restrict a, + dcomplex* restrict b, + dcomplex* restrict beta, + dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + void* a_next = bli_auxinfo_next_a( data ); + void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_mker = k0 / 6; + uint64_t k_left = k0 % 6; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + uint64_t info = 0; + + __asm__ volatile ( +// " ldr x0, %[a] \n\t" +// " ldr x1, %[b] \n\t" +" mov x2, xzr \n\t" +" incd x2, ALL, MUL #1 \n\t" // Column-skip of A. +" mov x3, #8 \n\t" // Row-skip of B. +" \n\t" +// " ldr x2, %[c] \n\t" +// " ldr x3, %[rs_c] \n\t" // Row-skip of C. +// " ldr x4, %[cs_c] \n\t" // Column-skip of C. +#ifdef _A64FX +" mov x16, 0x1 \n\t" // Tag A address. +" lsl x16, x16, #56 \n\t" +" orr %0, %0, x16 \n\t" +" mov x16, 0x2 \n\t" // Tag B address. +" lsl x16, x16, #56 \n\t" +" orr %1, %1, x16 \n\t" +" mov x16, 0x3 \n\t" // Tag C address. +" lsl x16, x16, #56 \n\t" +" orr %2, %2, x16 \n\t" +#endif +" \n\t" +" mov x16, #16 \n\t" // Multiply some address skips by sizeof(dcomplex). +" madd x2, x16, x2, xzr \n\t" // cs_a +" madd x3, x16, x3, xzr \n\t" // rs_b +" madd %4, x16, %4, xzr \n\t" // cs_c +" ptrue p0.d \n\t" +" \n\t" +// " ldr x5, %[k_mker] \n\t" // Number of loops. +// " ldr x6, %[k_left] \n\t" +" \n\t" +" LOAD_ABC: \n\t" +" cmp %5, #0 \n\t" // Don't preload if no microkernel there. +" b.eq END_CCOL_PRFM \n\t" +" \n\t" +" ld1rd z20.d, p0/z, [%1, 8*0] \n\t" // Load B's real & half of imaginary. +" ld1rd z21.d, p0/z, [%1, 8*2] \n\t" +" ld1rd z22.d, p0/z, [%1, 8*4] \n\t" +" ld1rd z23.d, p0/z, [%1, 8*6] \n\t" +" ld1rd z24.d, p0/z, [%1, 8*8] \n\t" +" ld1rd z25.d, p0/z, [%1, 8*10] \n\t" +" ld1rd z26.d, p0/z, [%1, 8*12] \n\t" +" ld1rd z27.d, p0/z, [%1, 8*14] \n\t" +" ld1rd z28.d, p0/z, [%1, 8*1] \n\t" +" ld1rd z29.d, p0/z, [%1, 8*3] \n\t" +" ld1rd z30.d, p0/z, [%1, 8*5] \n\t" +" ld1rd z31.d, p0/z, [%1, 8*7] \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z16,z17,p0,%0,x2) +" \n\t" +" CCOL_PRFM: \n\t" +" cmp %3, #1 \n\t" +" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. 
+" mov x16, %2 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" add x16, x16, %4 \n\t" +" prfm PLDL1KEEP, [x16] \n\t" +" END_CCOL_PRFM: \n\t" +" \n\t" +CLEAR_COL16(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15) +" \n\t" +" cmp %5, #0 \n\t" // If no 6-microkernel can be applied +" b.eq K_LEFT_LOOP \n\t" +" \n\t" +" K_MKER_LOOP: \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z18,z19,p0,%0,x2) +GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z16,z17,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3) +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z16,z17,p0,%0,x2) +GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3) +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z18,z19,p0,%0,x2) +GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z16,z17,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3) +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z16,z17,p0,%0,x2) +GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3) +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z18,z19,p0,%0,x2) +GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z16,z17,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3) +" \n\t" +" subs %5, %5, #1 \n\t" // Decrease counter before final replica. +" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z16,z17,p0,%0,x2) +GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3) +" b K_MKER_LOOP \n\t" +" \n\t" +" FIN_MKER_LOOP: \n\t" +GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_3_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3) +" \n\t" +" K_LEFT_LOOP: \n\t" +" cmp %6, #0 \n\t" // End of execution. +" b.eq WRITE_MEM_PREP \n\t" +" \n\t" +GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z16,z17,p0,%0,x2) +" ld1rd z20.d, p0/z, [%1, 8*0] \n\t" // Reload B's real & half of imaginary. +" ld1rd z21.d, p0/z, [%1, 8*2] \n\t" +" ld1rd z22.d, p0/z, [%1, 8*4] \n\t" +" ld1rd z23.d, p0/z, [%1, 8*6] \n\t" +" ld1rd z24.d, p0/z, [%1, 8*8] \n\t" +" ld1rd z25.d, p0/z, [%1, 8*10] \n\t" +" ld1rd z26.d, p0/z, [%1, 8*12] \n\t" +" ld1rd z27.d, p0/z, [%1, 8*14] \n\t" +" ld1rd z28.d, p0/z, [%1, 8*1] \n\t" +" ld1rd z29.d, p0/z, [%1, 8*3] \n\t" +" ld1rd z30.d, p0/z, [%1, 8*5] \n\t" +" ld1rd z31.d, p0/z, [%1, 8*7] \n\t" +GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z16,z17,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3) +" sub %6, %6, #1 \n\t" +" b K_LEFT_LOOP \n\t" // Next column / row. +" \n\t" +" WRITE_MEM_PREP: \n\t" +" \n\t" +// " ldr x7, %[alpha] \n\t" // Load alpha & beta (address). +// " ldr x8, %[beta] \n\t" +" ld1rd z16.d, p0/z, [%7] \n\t" // Real(alpha). +" ld1rd z17.d, p0/z, [%7, 8] \n\t" // Imag(alpha). +" ld1rd z18.d, p0/z, [%8] \n\t" // Real(beta). 
+" ld1rd z19.d, p0/z, [%8, 8] \n\t" // Imag(beta). +" \n\t" +" PREFETCH_ABNEXT: \n\t" +// " ldr x9, %[a_next] \n\t" +// " ldr x10, %[b_next] \n\t" +#ifdef _A64FX +" mov x16, 0x1 \n\t" // Tag A address. +" lsl x16, x16, #56 \n\t" +" orr %9, %9, x16 \n\t" +" mov x16, 0x2 \n\t" // Tag B address. +" lsl x16, x16, #56 \n\t" +" orr %10, %10, x16 \n\t" +#endif +" prfm PLDL1STRM, [%9] \n\t" +" prfm PLDL1STRM, [%9, 256*1] \n\t" +" prfm PLDL1STRM, [%10] \n\t" +" prfm PLDL1STRM, [%10, 256*1] \n\t" +" \n\t" +" WRITE_MEM: \n\t" +" \n\t" +GEMM_FMULCMPLX_COL2(z20,z21,z22,z23,p0,z0 ,z1 ,z2 ,z3 ,z16,z17) +GEMM_FMULCMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z4 ,z5 ,z6 ,z7 ,z16,z17) +GEMM_FMULCMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z8 ,z9 ,z10,z11,z16,z17) +GEMM_FMULCMPLX_COL2(z8 ,z9 ,z10,z11,p0,z12,z13,z14,z15,z16,z17) +" \n\t" +" UNIT_ALPHA: \n\t" +" mov x9, %2 \n\t" // C address for loading. +" \n\t" // C address for storing is %2 itself. +" cmp %3, #1 \n\t" +" b.ne WRITE_MEM_G \n\t" +" \n\t" +" WRITE_MEM_C: \n\t" +GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4) +GEMM_CCMPLX_LOAD_COL2_C(z24,z25,z26,z27,p0,x9,%4) +GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z18,z19) +GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z24,z25,z26,z27,z18,z19) +GEMM_CCMPLX_STORE_COL2_C(z20,z21,z22,z23,p0,%2,%4) +GEMM_CCMPLX_STORE_COL2_C(z0 ,z1 ,z2 ,z3 ,p0,%2,%4) +" \n\t" +GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4) +GEMM_CCMPLX_LOAD_COL2_C(z24,z25,z26,z27,p0,x9,%4) +GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z12,z13,z14,z15,z18,z19) +GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z24,z25,z26,z27,z18,z19) +GEMM_CCMPLX_STORE_COL2_C(z4 ,z5 ,z6 ,z7 ,p0,%2,%4) +GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4) +" b END_WRITE_MEM \n\t" +" \n\t" +" WRITE_MEM_G: \n\t" +" add %3, %3, %3 \n\t" // Skips passed to index is multiplied by 2, +" index z16.d, xzr, %3 \n\t" // s.t. 2*sizeof(double) = 2*8 = 16. +GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z16,x9,%4,x16) +GEMM_CCMPLX_LOAD_COL2_G(z24,z25,z26,z27,p0,z16,x9,%4,x16) +GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z18,z19) +GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z24,z25,z26,z27,z18,z19) +GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z16,%2,%4,x16) +GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z16,%2,%4,x16) +" \n\t" +GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z16,x9,%4,x16) +GEMM_CCMPLX_LOAD_COL2_G(z24,z25,z26,z27,p0,z16,x9,%4,x16) +GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z12,z13,z14,z15,z18,z19) +GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z24,z25,z26,z27,z18,z19) +GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z16,%2,%4,x16) +GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z16,%2,%4,x16) +" \n\t" +" END_WRITE_MEM: \n\t" +" b END_EXEC \n\t" +" \n\t" +" END_EXEC: \n\t" +" mov %11, #0 \n\t" // Return normal. 
+: "+r" (a), // %0 + "+r" (b), // %1 + "+r" (c), // %2 + "+r" (rs_c), // %3 + "+r" (cs_c), // %4 + "+r" (k_mker), // %5 + "+r" (k_left), // %6 + "+r" (alpha), // %7 + "+r" (beta), // %8 + "+r" (a_next), // %9 + "+r" (b_next), // %10 + "=r" (info) // %11 +: +: "x2","x3","x9","x16", + "z0","z1","z2","z3","z4","z5","z6","z7", + "z8","z9","z10","z11","z12","z13","z14","z15", + "z16","z17","z18","z19", + "z20","z21","z22","z23", + "z24","z25","z26","z27", + "z28","z29","z30","z31" + ); +} + diff --git a/kernels/armsve/3/sup/bli_gemmsup_armsve_ref.c b/kernels/armsve/3/sup/bli_gemmsup_armsve_ref.c deleted file mode 100644 index ff3a35e7a6..0000000000 --- a/kernels/armsve/3/sup/bli_gemmsup_armsve_ref.c +++ /dev/null @@ -1,450 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -// Separate instantiation for ArmSVE reference kernels. -// Temporary workaround. Will be removed after upstream has switched to a better way -// of exposing gemmsup interface. - -// -// -- Row storage case --------------------------------------------------------- -// - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, inc_t rs_a, inc_t cs_a, \ - ctype* restrict b, inc_t rs_b, inc_t cs_b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - /* NOTE: This microkernel can actually handle arbitrarily large - values of m, n, and k. */ \ -\ - if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \ - { \ - /* Traverse c by rows. 
*/ \ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict ci = &c[ i*rs_c ]; \ - ctype* restrict ai = &a[ i*rs_a ]; \ -\ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cij = &ci[ j*cs_c ]; \ - ctype* restrict bj = &b [ j*cs_b ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,dots)( *aij, *bij, ab ); \ - } \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ - else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \ - { \ - /* Traverse c by rows. */ \ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict ci = &c[ i*rs_c ]; \ - ctype* restrict ai = &a[ i*rs_a ]; \ -\ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cij = &ci[ j*cs_c ]; \ - ctype* restrict bj = &b [ j*cs_b ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \ - } \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ - else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \ - { \ - /* Traverse c by rows. */ \ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict ci = &c[ i*rs_c ]; \ - ctype* restrict ai = &a[ i*rs_a ]; \ -\ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cij = &ci[ j*cs_c ]; \ - ctype* restrict bj = &b [ j*cs_b ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \ - } \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \ - { \ - /* Traverse c by rows. 
*/ \ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict ci = &c[ i*rs_c ]; \ - ctype* restrict ai = &a[ i*rs_a ]; \ -\ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cij = &ci[ j*cs_c ]; \ - ctype* restrict bj = &b [ j*cs_b ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,dots)( *aij, *bij, ab ); \ - } \ -\ - /* Conjugate the result to simulate conj(a^T) * conj(b). */ \ - PASTEMAC(ch,conjs)( ab ); \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( gemmsup_r, _armsve, _ref2 ) - -// -// -- Column storage case ------------------------------------------------------ -// - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, inc_t rs_a, inc_t cs_a, \ - ctype* restrict b, inc_t rs_b, inc_t cs_b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - /* NOTE: This microkernel can actually handle arbitrarily large - values of m, n, and k. */ \ -\ - if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \ - { \ - /* Traverse c by columns. */ \ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cj = &c[ j*cs_c ]; \ - ctype* restrict bj = &b[ j*cs_b ]; \ -\ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict cij = &cj[ i*rs_c ]; \ - ctype* restrict ai = &a [ i*rs_a ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,dots)( *aij, *bij, ab ); \ - } \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ - else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \ - { \ - /* Traverse c by columns. */ \ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cj = &c[ j*cs_c ]; \ - ctype* restrict bj = &b[ j*cs_b ]; \ -\ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict cij = &cj[ i*rs_c ]; \ - ctype* restrict ai = &a [ i*rs_a ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \ - } \ -\ - /* If beta is one, add ab into c. 
If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ - else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \ - { \ - /* Traverse c by columns. */ \ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cj = &c[ j*cs_c ]; \ - ctype* restrict bj = &b[ j*cs_b ]; \ -\ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict cij = &cj[ i*rs_c ]; \ - ctype* restrict ai = &a [ i*rs_a ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \ - } \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \ - { \ - /* Traverse c by columns. */ \ - for ( dim_t j = 0; j < n; ++j ) \ - { \ - ctype* restrict cj = &c[ j*cs_c ]; \ - ctype* restrict bj = &b[ j*cs_b ]; \ -\ - for ( dim_t i = 0; i < m; ++i ) \ - { \ - ctype* restrict cij = &cj[ i*rs_c ]; \ - ctype* restrict ai = &a [ i*rs_a ]; \ - ctype ab; \ -\ - PASTEMAC(ch,set0s)( ab ); \ -\ - /* Perform a dot product to update the (i,j) element of c. */ \ - for ( dim_t l = 0; l < k; ++l ) \ - { \ - ctype* restrict aij = &ai[ l*cs_a ]; \ - ctype* restrict bij = &bj[ l*rs_b ]; \ -\ - PASTEMAC(ch,dots)( *aij, *bij, ab ); \ - } \ -\ - /* Conjugate the result to simulate conj(a^T) * conj(b). */ \ - PASTEMAC(ch,conjs)( ab ); \ -\ - /* If beta is one, add ab into c. If beta is zero, overwrite c - with the result in ab. Otherwise, scale by beta and accumulate - ab to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ - } \ - else \ - { \ - PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ - } \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( gemmsup_c, _armsve, _ref2 ) - diff --git a/kernels/armsve/3/sup/bli_gemmsup_cv_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/sup/bli_gemmsup_cv_armsve_asm_d2vx10_unindexed.c deleted file mode 100644 index 3341b63d00..0000000000 --- a/kernels/armsve/3/sup/bli_gemmsup_cv_armsve_asm_d2vx10_unindexed.c +++ /dev/null @@ -1,528 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, The University of Tokyo - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -*/ -#include "blis.h" -#include - -// Double-precision composite instructions. -#include "../armsve_asm_macros_double.h" - -// 2vx10 microkernels. -#include "../armsve_asm_2vx10.h" - -// Prototype reference kernel. -GEMMSUP_KER_PROT( double, d, gemmsup_c_armsve_ref2 ) - -void __attribute__ ((noinline,optimize(0))) bli_dgemmsup_cv_armsve_2vx10_unindexed - ( - conj_t conja, - conj_t conjb, - dim_t m0, - dim_t n0, - dim_t k0, - double* restrict alpha, - double* restrict a, inc_t rs_a0, inc_t cs_a0, - double* restrict b, inc_t rs_b0, inc_t cs_b0, - double* restrict beta, - double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx - ) -{ - static int called = 0; - if ( !called ) - { - fprintf(stderr, "rv called.\n"); - called = 1; - } - // c*c requires A to be stored in columns. - assert( rs_a0 == 1 ); - - dim_t n0_mker = n0 / 10; - dim_t n0_left = n0 % 10; - - if ( n0_left ) - { - // A[:, ::] - // B[::, n0_mker*10:n0] - // C[: , n0_mker*10:n0] - double *ai = a; - double *bi = b + n0_mker * 10 * cs_b0; - double *ci = c + n0_mker * 10 * cs_c0; - bli_dgemmsup_c_armsve_ref2 - ( - conja, conjb, - m0, n0_left, k0, - alpha, - ai, rs_a0, cs_a0, - bi, rs_b0, cs_b0, - beta, - ci, rs_c0, cs_c0, - data, - cntx - ); - } - // Return if it's a pure edge case. - if ( !n0_mker ) - return; - - // Determine VL. - uint64_t vlen2; - __asm__ ( - " mov x0, xzr \n\t" - " incd x0, ALL, MUL #2 \n\t" - " mov %[vlen2], x0 \n\t" - : [vlen2] "=r" (vlen2) - : - : "x0" - ); - - uint64_t rs_c = rs_c0; - uint64_t cs_c = cs_c0; - // uint64_t rs_a = 1; - uint64_t cs_a = cs_a0; - uint64_t rs_b = rs_b0; - uint64_t cs_b = cs_b0; - - uint64_t k_mker = k0 / 4; - uint64_t k_left = k0 % 4; - uint64_t n_mker = n0_mker; - - dim_t m0_mker = m0 / vlen2; - dim_t m0_left = m0 % vlen2; - if ( m0_left ) - { - // Edge case on A side can be handled with one more (predicated) loop. - m0_mker++; - } else - m0_left = vlen2; - // uint64_t ps_a = bli_auxinfo_ps_a( data ); - uint64_t ps_b = bli_auxinfo_ps_b( data ); - - for ( dim_t im0_mker = 0; im0_mker < m0_mker; ++im0_mker ) - { - uint64_t m_curr = vlen2; - if ( im0_mker == m0_mker - 1 ) - { - // Last m-loop. Maybe unnecessary. 
- m_curr = m0_left; - } - double *ai = a + im0_mker * vlen2 * rs_a0; - double *bi = b; - double *ci = c + im0_mker * vlen2 * rs_c0; - - void* a_next = bli_auxinfo_next_a( data ); - void* b_next = bli_auxinfo_next_b( data ); - - __asm__ volatile ( -" ldr x0, %[bi] \n\t" -" ldr x1, %[rs_b] \n\t" // Row-skip of B. -" ldr x2, %[cs_b] \n\t" // Column-skip of B (element skip of B[l, :]). -" ldr x3, %[ps_b] \n\t" // Panel-skip (10*k) of B. -" ldr x4, %[cs_a] \n\t" // Column-Skip of A. -" \n\t" // Element skip of A[:, l] is guaranteed to be 1. -" ldr x5, %[ci] \n\t" -" ldr x6, %[rs_c] \n\t" // Row-skip of C. -" ldr x7, %[cs_c] \n\t" // Column-skip of C. -#ifdef _A64FX -" mov x16, 0x1 \n\t" // Tag C address. -" lsl x16, x16, #56 \n\t" -" orr x5, x5, x16 \n\t" -" mov x16, 0x2 \n\t" // Tag B address. -" lsl x16, x16, #56 \n\t" -" orr x0, x0, x16 \n\t" -#endif -" \n\t" -" mov x8, #8 \n\t" // Multiply some address skips by sizeof(double). -" madd x1, x8, x1, xzr \n\t" // rs_b -" madd x2, x8, x2, xzr \n\t" // cs_b -" madd x3, x8, x3, xzr \n\t" // ps_b -" madd x4, x8, x4, xzr \n\t" // cs_a -" madd x7, x8, x7, xzr \n\t" // cs_c -" mov x8, #4 \n\t" -" madd x15, x8, x4, xzr \n\t" // Logical K=4 microkernel skip for A. -" \n\t" -#ifdef _A64FX -" mov x16, 0x20 \n\t" // Higher 6bit for Control#2: -" lsl x16, x16, #58 \n\t" // Valid|Strong|Strong|NoAlloc|Load|Strong -" orr x16, x16, x4 \n\t" // Stride. -" msr S3_3_C11_C6_2, x16 \n\t" // Write system register. -#endif -" \n\t" -" ldr x8, %[m_curr] \n\t" // Size of first dimension. -" mov x9, xzr \n\t" -" incd x9 \n\t" -" ptrue p0.d \n\t" -" whilelo p1.d, xzr, x8 \n\t" -" whilelo p2.d, x9, x8 \n\t" -" \n\t" -" ldr x8, %[n_mker] \n\t" // Number of N-loops. -" \n\t" -" ldr x20, %[ai] \n\t" // Parameters to be reloaded -" ldr x21, %[k_mker] \n\t" // within each millikernel loop. -" ldr x22, %[k_left] \n\t" -" ldr x23, %[alpha] \n\t" -" ldr x24, %[beta] \n\t" -" ldr x25, %[a_next] \n\t" -" ldr x26, %[b_next] \n\t" -" ldr x23, [x23] \n\t" // Directly load alpha and beta. -" ldr x24, [x24] \n\t" -" \n\t" -" MILLIKER_MLOOP: \n\t" -" \n\t" -" mov x11, x0 \n\t" // B's address. -// " ldr x10, %[ai] \n\t" // A's address. -" mov x10, x20 \n\t" -// " ldr x12, %[k_mker] \n\t" -" mov x12, x21 \n\t" -// " ldr x13, %[k_left] \n\t" -" mov x13, x22 \n\t" -#ifdef _A64FX -" mov x16, 0x3 \n\t" // Tag A address. -" lsl x16, x16, #56 \n\t" -" orr x10, x10, x16 \n\t" -" mov x16, 0xa \n\t" // Control#2 for A address. -" lsl x16, x16, #60 \n\t" -" orr x10, x10, x16 \n\t" -#endif -" \n\t" -" cmp x12, #0 \n\t" // Don't preload if no microkernel there. -" b.eq END_CCOL_PRFM \n\t" -" \n\t" -" mov x14, x11 \n\t" -" ld1rd z20.d, p0/z, [x14] \n\t" // Load 8/10 of first B row. -" add x14, x14, x2 \n\t" -" ld1rd z21.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z22.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z23.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z24.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z25.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z26.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z27.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" prfm PLDL1KEEP, [x14] \n\t" // And prefetch the 2/10 left. -" add x14, x14, x2 \n\t" -" prfm PLDL1KEEP, [x14] \n\t" -" sub x14, x14, x2 \n\t" // Restore x14 to load edge. -" \n\t" -GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p1,p2,x10) -" add x16, x10, x4 \n\t" -" prfm PLDL1STRM, [x16] \n\t" // Prefetch 3/4 of A. 
-" add x16, x10, x4 \n\t" -" prfm PLDL1STRM, [x16] \n\t" -" add x16, x10, x4 \n\t" -" prfm PLDL1STRM, [x16] \n\t" -" \n\t" -" CCOL_PRFM: \n\t" -" cmp x6, #1 \n\t" -" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. -" mov x16, x5 \n\t" -" prfm PLDL1STRM, [x16] \n\t" -" add x16, x16, x7 \n\t" -" prfm PLDL1STRM, [x16] \n\t" -" add x16, x16, x7 \n\t" -" prfm PLDL1STRM, [x16] \n\t" -" add x16, x16, x7 \n\t" -" prfm PLDL1STRM, [x16] \n\t" -" add x16, x16, x7 \n\t" -" prfm PLDL1STRM, [x16] \n\t" -" add x16, x16, x7 \n\t" -" prfm PLDL1STRM, [x16] \n\t" -" add x16, x16, x7 \n\t" -" prfm PLDL1STRM, [x16] \n\t" -" add x16, x16, x7 \n\t" -" prfm PLDL1STRM, [x16] \n\t" -" add x16, x16, x7 \n\t" -" prfm PLDL1STRM, [x16] \n\t" -" add x16, x16, x7 \n\t" -" prfm PLDL1STRM, [x16] \n\t" -" END_CCOL_PRFM: \n\t" -" \n\t" -CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) -" \n\t" -" cmp x12, #0 \n\t" // If no 4-microkernel can be applied -" b.eq K_LEFT_LOOP \n\t" -" \n\t" -" K_MKER_LOOP: \n\t" -" \n\t" -GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z30,z31,p1,p2,x10,x15,x4,x16,noprfm) -GEMM_2VX10_MKER_LOOP_PLAIN_G_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2) -" \n\t" -GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z28,z29,p1,p2,x10,x15,x4,x16,noprfm) -GEMM_2VX10_MKER_LOOP_PLAIN_G_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2) -" \n\t" -GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z30,z31,p1,p2,x10,x15,x4,x16,noprfm) -GEMM_2VX10_MKER_LOOP_PLAIN_G_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2) -" \n\t" -" subs x12, x12, #1 \n\t" // Decrease counter before final replica. -" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. -" \n\t" -GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z28,z29,p1,p2,x10,x15,x4,x16,noprfm) -GEMM_2VX10_MKER_LOOP_PLAIN_G_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2) -" b K_MKER_LOOP \n\t" -" \n\t" -" FIN_MKER_LOOP: \n\t" -GEMM_2VX10_MKER_LOOP_PLAIN_G_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2) -" add x10, x10, x4 \n\t" // Forward A to fill the blank. -" \n\t" -" K_LEFT_LOOP: \n\t" -" cmp x13, #0 \n\t" // End of execution. -" b.eq WRITE_MEM_PREP \n\t" -" \n\t" -GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p1,p2,x10) -" mov x14, x11 \n\t" -" ld1rd z20.d, p0/z, [x14] \n\t" // Load 10/10 B. 
-" add x14, x14, x2 \n\t" -" ld1rd z21.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z22.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z23.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z24.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z25.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z26.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z27.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z28.d, p0/z, [x14] \n\t" -" add x14, x14, x2 \n\t" -" ld1rd z29.d, p0/z, [x14] \n\t" -GEMM_FMLA2(z0,z1,p0,z30,z31,z20) -GEMM_FMLA2(z2,z3,p0,z30,z31,z21) -GEMM_FMLA2(z4,z5,p0,z30,z31,z22) -GEMM_FMLA2(z6,z7,p0,z30,z31,z23) -GEMM_FMLA2(z8,z9,p0,z30,z31,z24) -GEMM_FMLA2(z10,z11,p0,z30,z31,z25) -GEMM_FMLA2(z12,z13,p0,z30,z31,z26) -GEMM_FMLA2(z14,z15,p0,z30,z31,z27) -GEMM_FMLA2(z16,z17,p0,z30,z31,z28) -GEMM_FMLA2(z18,z19,p0,z30,z31,z29) -" add x10, x10, x4 \n\t" // Forward A. -" add x11, x11, x1 \n\t" // Forward B. -" sub x13, x13, #1 \n\t" -" b K_LEFT_LOOP \n\t" // Next column / row. -" \n\t" -" WRITE_MEM_PREP: \n\t" -" \n\t" -// " ldr x10, %[ai] \n\t" -" mov x10, x20 \n\t" -" add x11, x0, x3 \n\t" -" dup z30.d, x23 \n\t" // Broadcast alpha & beta into vectors. -" dup z31.d, x24 \n\t" -" \n\t" -" cmp x8, #1 \n\t" -" b.eq PREFETCH_ABNEXT \n\t" -" prfm PLDL1STRM, [x10] \n\t" -" prfm PLDL1KEEP, [x11] \n\t" -" add x11, x11, x2 \n\t" -" prfm PLDL1KEEP, [x11] \n\t" -" add x11, x11, x2 \n\t" -" prfm PLDL1KEEP, [x11] \n\t" -" add x11, x11, x2 \n\t" -" prfm PLDL1KEEP, [x11] \n\t" -" add x11, x11, x2 \n\t" -" prfm PLDL1KEEP, [x11] \n\t" -" add x11, x11, x2 \n\t" -" prfm PLDL1KEEP, [x11] \n\t" -" add x11, x11, x2 \n\t" -" prfm PLDL1KEEP, [x11] \n\t" -" add x11, x11, x2 \n\t" -" prfm PLDL1KEEP, [x11] \n\t" -" add x11, x11, x2 \n\t" -" prfm PLDL1KEEP, [x11] \n\t" -" add x11, x11, x2 \n\t" -" prfm PLDL1KEEP, [x11] \n\t" -" b WRITE_MEM \n\t" -" \n\t" -" PREFETCH_ABNEXT: \n\t" -// " ldr x1, %[a_next] \n\t" // Final Millikernel loop, x1 and x2 not needed. -" mov x1, x25 \n\t" -// " ldr x2, %[b_next] \n\t" -" mov x2, x26 \n\t" -" prfm PLDL2KEEP, [x1] \n\t" -" prfm PLDL2KEEP, [x1, 256*1] \n\t" -" prfm PLDL2KEEP, [x1, 256*2] \n\t" -" prfm PLDL2KEEP, [x1, 256*3] \n\t" -" prfm PLDL2KEEP, [x1, 256*4] \n\t" -" prfm PLDL2KEEP, [x1, 256*5] \n\t" -" prfm PLDL2KEEP, [x1, 256*6] \n\t" -" prfm PLDL2KEEP, [x1, 256*7] \n\t" -" prfm PLDL2KEEP, [x1, 256*8] \n\t" -" prfm PLDL2KEEP, [x1, 256*9] \n\t" -" prfm PLDL2KEEP, [x1, 256*10] \n\t" -" prfm PLDL2KEEP, [x1, 256*11] \n\t" -" prfm PLDL2KEEP, [x1, 256*12] \n\t" -" prfm PLDL2KEEP, [x1, 256*13] \n\t" -" prfm PLDL2KEEP, [x1, 256*14] \n\t" -" prfm PLDL2KEEP, [x1, 256*15] \n\t" -" prfm PLDL2KEEP, [x2] \n\t" -" prfm PLDL2KEEP, [x2, 256*1] \n\t" -" prfm PLDL2KEEP, [x2, 256*2] \n\t" -" prfm PLDL2KEEP, [x2, 256*3] \n\t" -" prfm PLDL2KEEP, [x2, 256*4] \n\t" -" prfm PLDL2KEEP, [x2, 256*5] \n\t" -" prfm PLDL2KEEP, [x2, 256*6] \n\t" -" prfm PLDL2KEEP, [x2, 256*7] \n\t" -" prfm PLDL2KEEP, [x2, 256*8] \n\t" -" prfm PLDL2KEEP, [x2, 256*9] \n\t" -" \n\t" -" WRITE_MEM: \n\t" -" \n\t" -" fmov d28, #1.0 \n\t" -" fmov x16, d28 \n\t" -" cmp x16, x23 \n\t" -" b.eq UNIT_ALPHA \n\t" -" \n\t" -SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30) -" \n\t" -" UNIT_ALPHA: \n\t" -" mov x9, x5 \n\t" // C address for loading. -" \n\t" // C address for storing is x5 itself. -" cmp x6, #1 \n\t" -" b.ne WRITE_MEM_G \n\t" -" \n\t" -" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30]. -" \n\t" // Here used scratch: Z[20-29]. 
-" mov x13, xzr \n\t" // C-column's physical 1-vector skip. -" incb x13 \n\t" -GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x9,x7) -GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31) -GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x9,x7) -" \n\t" -GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x5,x7) -GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) -GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x5,x7) -" b END_WRITE_MEM \n\t" -" \n\t" -" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30]. -" \n\t" // Here used scratch: Z[20-30] - Z30 as index. -" mov x12, xzr \n\t" -" incb x12 \n\t" -" madd x13, x12, x6, xzr \n\t" // C-column's logical 1-vector skip. -" index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8. -GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x9,x7,x13,x16) -GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31) -GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x9,x7,x13,x16) -" \n\t" -GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x5,x7,x13,x16) -GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) -GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x5,x7,x13,x16) -" \n\t" -" END_WRITE_MEM: \n\t" -" subs x8, x8, #1 \n\t" -" b.eq END_EXEC \n\t" -" \n\t" // Address of C already forwarded to next column. -" add x0, x0, x3 \n\t" // Forward B's base address to the next logic panel. -" b MILLIKER_MLOOP \n\t" -" \n\t" -" END_ERROR: \n\t" -" mov x0, #1 \n\t" // Return error. -" END_EXEC: \n\t" -" mov x0, #0 \n\t" // Return normal. 
-: -: [bi] "m" (bi), - [rs_b] "m" (rs_b), - [cs_b] "m" (cs_b), - [ps_b] "m" (ps_b), - [cs_a] "m" (cs_a), - [ci] "m" (ci), - [rs_c] "m" (rs_c), - [cs_c] "m" (cs_c), - [m_curr] "m" (m_curr), - [n_mker] "m" (n_mker), - [ai] "m" (ai), - [k_mker] "m" (k_mker), - [k_left] "m" (k_left), - [alpha] "m" (alpha), - [beta] "m" (beta), - [a_next] "m" (a_next), - [b_next] "m" (b_next) -: "x0","x1","x2","x3","x4","x5","x6","x7","x8", - "x9","x10","x11","x12","x13","x14","x15","x16","x17", - "x20","x21","x22","x23","x24","x25","x26", - "z0","z1","z2","z3","z4","z5","z6","z7", - "z8","z9","z10","z11","z12","z13","z14","z15", - "z16","z17","z18","z19", - "z20","z21","z22","z23", - "z24","z25","z26","z27", - "z28","z29","z30","z31" - ); - } -} - -void bli_dgemmsup_rv_armsve_10x2v_unindexed - ( - conj_t conjat, - conj_t conjbt, - dim_t m0t, - dim_t n0t, - dim_t k0, - double* restrict alpha, - double* restrict at, inc_t rs_at0, inc_t cs_at0, - double* restrict bt, inc_t rs_bt0, inc_t cs_bt0, - double* restrict beta, - double* restrict ct, inc_t rs_ct0, inc_t cs_ct0, - auxinfo_t* restrict datat, - cntx_t* restrict cntx - ) -{ - auxinfo_t data; - bli_auxinfo_set_next_a( bli_auxinfo_next_b( datat ), &data ); - bli_auxinfo_set_next_b( bli_auxinfo_next_a( datat ), &data ); - bli_auxinfo_set_ps_a( bli_auxinfo_ps_b( datat ), &data ); - bli_auxinfo_set_ps_b( bli_auxinfo_ps_a( datat ), &data ); - bli_dgemmsup_cv_armsve_2vx10_unindexed - ( - conjbt, conjat, - n0t, m0t, k0, - alpha, - bt, cs_bt0, rs_bt0, - at, cs_at0, rs_at0, - beta, - ct, cs_ct0, rs_ct0, - &data, - cntx - ); -} - diff --git a/kernels/armsve/3/sup/bli_gemmsup_rv_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/sup/bli_gemmsup_rv_armsve_asm_d2vx10_unindexed.c deleted file mode 100644 index 6bcea73f5d..0000000000 --- a/kernels/armsve/3/sup/bli_gemmsup_rv_armsve_asm_d2vx10_unindexed.c +++ /dev/null @@ -1,412 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2020, The University of Tokyo - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -*/ -#include "blis.h" -#include - -// Double-precision composite instructions. -#include "../armsve_asm_macros_double.h" - -// 2vx10 microkernels. -#include "../armsve_asm_2vx10.h" - -// Prototype reference kernel. -GEMMSUP_KER_PROT( double, d, gemmsup_r_armsve_ref2 ) - -void __attribute__ ((optimize(0))) bli_dgemmsup_rv_armsve_2vx10_unindexed - ( - conj_t conja, - conj_t conjb, - dim_t m0, - dim_t n0, - dim_t k0, - double* restrict alpha, - double* restrict a, inc_t rs_a0, inc_t cs_a0, - double* restrict b, inc_t rs_b0, inc_t cs_b0, - double* restrict beta, - double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx - ) -{ - static int called = 0; - if ( !called ) - { - fprintf(stderr, "rv called.\n"); - called = 1; - } - // r*r requires B to be stored in rows. - assert(cs_b0 == 1); - - dim_t n0_mker = n0 / 10; - dim_t n0_left = n0 % 10; - - if ( n0_left ) - { - // A[:, ::] - // B[::, n0_mker*10:n0] - // C[: , n0_mker*10:n0] - double *ai = a; - double *bi = b + n0_mker * 10 * cs_b0; - double *ci = c + n0_mker * 10 * cs_c0; - bli_dgemmsup_r_armsve_ref2 - ( - conja, conjb, - m0, n0_left, k0, - alpha, - ai, rs_a0, cs_a0, - bi, rs_b0, cs_b0, - beta, - ci, rs_c0, cs_c0, - data, - cntx - ); - } - // Return if it's a pure edge case. - if ( !n0_mker ) - return; - - // Determine VL. - uint64_t vlen2; - __asm__ ( - " mov x0, xzr \n\t" - " incd x0, ALL, MUL #2 \n\t" - " mov %[vlen2], x0 \n\t" - : [vlen2] "=r" (vlen2) - : - : "x0" - ); - - uint64_t rs_c = rs_c0; - uint64_t cs_c = cs_c0; - uint64_t rs_a = rs_a0; - uint64_t cs_a = cs_a0; - uint64_t rs_b = rs_b0; - // uint64_t cs_b = 1; - - uint64_t k_mker = k0 / 4; - uint64_t k_left = k0 % 4; - uint64_t m_mker = m0 / vlen2; - uint64_t m_left = m0 % vlen2; - if ( m_left ) - { - // Edge case on A side can be handled with one more (predicated) loop. - m_mker++; - } else - m_left = vlen2; - uint64_t ps_a = bli_auxinfo_ps_a( data ); - // uint64_t ps_b = bli_auxinfo_ps_b( data ); - - for ( dim_t in0_mker = 0; in0_mker < n0_mker; ++in0_mker ) - { - double *ai = a; - double *bi = b + in0_mker * 10 * cs_b0; - double *ci = c + in0_mker * 10 * cs_c0; - - void* a_next = bli_auxinfo_next_a( data ); - void* b_next = bli_auxinfo_next_b( data ); - - __asm__ volatile ( -" ldr x0, %[ai] \n\t" -" ldr x1, %[rs_a] \n\t" // Row-skip of A (element skip of A[:, l]). -" ldr x2, %[cs_a] \n\t" // Column-skip of A. -" ldr x3, %[ps_a] \n\t" // Panel-skip (vlen2*k) of A. -" ldr x4, %[rs_b] \n\t" // Row-Skip of B. -" \n\t" // Element skip of B[l, :] is guaranteed to be 1. -" ldr x5, %[ci] \n\t" -" ldr x6, %[rs_c] \n\t" // Row-skip of C. -" ldr x7, %[cs_c] \n\t" // Column-skip of C. -#ifdef _A64FX -" mov x16, 0x1 \n\t" // Tag C address. -" lsl x16, x16, #56 \n\t" -" orr x5, x5, x16 \n\t" -" mov x16, 0x2 \n\t" // Tag A address. -" lsl x16, x16, #56 \n\t" -" orr x0, x0, x16 \n\t" -#endif -" \n\t" -" mov x8, #8 \n\t" // Multiply some address skips by sizeof(double). 
-" madd x2, x8, x2, xzr \n\t" // cs_a -" madd x3, x8, x3, xzr \n\t" // ps_a -" madd x4, x8, x4, xzr \n\t" // rs_b -" madd x7, x8, x7, xzr \n\t" // cs_c -" mov x8, xzr \n\t" -" incb x8 \n\t" -" madd x14, x8, x1, xzr \n\t" // A-column's logical 1-vector skip. -" mov x8, #4 \n\t" -" madd x15, x8, x2, xzr \n\t" // Logical K=4 microkernel skip for A. -// " mov x8, #4 \n\t" -// " madd x17, x8, x4, xzr \n\t" // Logical K=4 microkernel skip for B. -" \n\t" -" ldr x8, %[m_mker] \n\t" // Number of M-loops. -" ptrue p0.d \n\t" -" ptrue p1.d \n\t" -" ptrue p2.d \n\t" -" \n\t" -" MILLIKER_MLOOP: \n\t" -" \n\t" -" cmp x8, #1 \n\t" -" b.ne UKER_BEGIN \n\t" -" \n\t" -" ldr x10, %[m_left] \n\t" // Final (incomplete) millikernel loop. -" mov x11, xzr \n\t" -" incd x11 \n\t" -" whilelo p1.d, xzr, x10 \n\t" // Overwrite p1/p2. -" whilelo p2.d, x11, x10 \n\t" -" \n\t" -" UKER_BEGIN: \n\t" -" mov x10, x0 \n\t" // A's address. -" ldr x11, %[bi] \n\t" // B's address. -" ldr x12, %[k_mker] \n\t" -" ldr x13, %[k_left] \n\t" -#ifdef _A64FX -" mov x16, 0x3 \n\t" // Tag B address. -" lsl x16, x16, #56 \n\t" -" orr x11, x11, x16 \n\t" -#endif -" \n\t" -" mov x16, x11 \n\t" // Prefetch first kernel of B. -" prfm PLDL1KEEP, [x16] \n\t" -" add x16, x16, x4 \n\t" -" prfm PLDL1KEEP, [x16] \n\t" -" add x16, x16, x4 \n\t" -" prfm PLDL1KEEP, [x16] \n\t" -" add x16, x16, x4 \n\t" -" prfm PLDL1KEEP, [x16] \n\t" -" \n\t" -" ld1rd z20.d, p0/z, [x11] \n\t" // (Partial) first B row. -" ld1rd z21.d, p0/z, [x11, #8] \n\t" -" ld1rd z22.d, p0/z, [x11, #16] \n\t" -" ld1rd z23.d, p0/z, [x11, #24] \n\t" -" ld1rd z24.d, p0/z, [x11, #32] \n\t" -" ld1rd z25.d, p0/z, [x11, #40] \n\t" -" ld1rd z26.d, p0/z, [x11, #48] \n\t" -" ld1rd z27.d, p0/z, [x11, #56] \n\t" -" \n\t" -" index z29.d, xzr, x1 \n\t" // First A column. -" \n\t" // Skips passed to index is not multiplied by 8. -GEMM_ACOL_GATHER_LOAD(z28,z29,z29,p1,p2,x10,x14,x16) -" \n\t" -CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) -" \n\t" -" cmp x12, #0 \n\t" // If no 4-microkernel can be applied -" b.eq K_LEFT_LOOP \n\t" -" \n\t" -" K_MKER_LOOP: \n\t" // Unroll the 4-loop. -" \n\t" -" index z31.d, xzr, x1 \n\t" -GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z30,z31,z31,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm) -GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4) -" \n\t" -" index z29.d, xzr, x1 \n\t" -GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z28,z29,z29,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm) -GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4) -" \n\t" -" index z31.d, xzr, x1 \n\t" -GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z30,z31,z31,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm) -GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4) -" \n\t" -" subs x12, x12, #1 \n\t" // Decrease counter before final replica. -" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. 
-" \n\t" -" index z29.d, xzr, x1 \n\t" -GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z28,z29,z29,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm) -GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4) -" b K_MKER_LOOP \n\t" -" \n\t" -" FIN_MKER_LOOP: \n\t" -GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4) -" add x10, x10, x2 \n\t" // Forward A to fill the blank. -" \n\t" -" K_LEFT_LOOP: \n\t" -" cmp x13, #0 \n\t" -" b.eq WRITE_MEM_PREP \n\t" -" \n\t" -" index z31.d, xzr, x1 \n\t" -GEMM_ACOL_GATHER_LOAD(z30,z31,z31,p1,p2,x10,x14,x16) -" ld1rd z20.d, p0/z, [x11] \n\t" -" ld1rd z21.d, p0/z, [x11, #8] \n\t" -" ld1rd z22.d, p0/z, [x11, #16] \n\t" -" ld1rd z23.d, p0/z, [x11, #24] \n\t" -" ld1rd z24.d, p0/z, [x11, #32] \n\t" -" ld1rd z25.d, p0/z, [x11, #40] \n\t" -" ld1rd z26.d, p0/z, [x11, #48] \n\t" -" ld1rd z27.d, p0/z, [x11, #56] \n\t" -" ld1rd z28.d, p0/z, [x11, #64] \n\t" -" ld1rd z29.d, p0/z, [x11, #72] \n\t" -GEMM_FMLA2(z0,z1,p0,z30,z31,z20) -GEMM_FMLA2(z2,z3,p0,z30,z31,z21) -GEMM_FMLA2(z4,z5,p0,z30,z31,z22) -GEMM_FMLA2(z6,z7,p0,z30,z31,z23) -GEMM_FMLA2(z8,z9,p0,z30,z31,z24) -GEMM_FMLA2(z10,z11,p0,z30,z31,z25) -GEMM_FMLA2(z12,z13,p0,z30,z31,z26) -GEMM_FMLA2(z14,z15,p0,z30,z31,z27) -GEMM_FMLA2(z16,z17,p0,z30,z31,z28) -GEMM_FMLA2(z18,z19,p0,z30,z31,z29) -" add x10, x10, x2 \n\t" // Forward A. -" add x11, x11, x4 \n\t" // Forward B. -" sub x13, x13, #1 \n\t" -" b K_LEFT_LOOP \n\t" // Next column / row. -" \n\t" -" WRITE_MEM_PREP: \n\t" -" \n\t" -" ldr x11, %[bi] \n\t" -" ldr x12, %[alpha] \n\t" // Load alpha & beta. -" ldr x13, %[beta] \n\t" -" ld1rd z30.d, p0/z, [x12] \n\t" -" ld1rd z31.d, p0/z, [x13] \n\t" -" ldr x12, [x12] \n\t" -" \n\t" -" cmp x8, #1 \n\t" -" b.eq PREFETCH_ABNEXT \n\t" -" prfm PLDL2STRM, [x11] \n\t" -" b WRITE_MEM \n\t" -" \n\t" -" PREFETCH_ABNEXT: \n\t" -" ldr x1, %[a_next] \n\t" // Final Millikernel loop, x1 and x2 not needed. -" ldr x2, %[b_next] \n\t" -" prfm PLDL2KEEP, [x1] \n\t" -" prfm PLDL2KEEP, [x1, 256*1] \n\t" -" prfm PLDL2KEEP, [x1, 256*2] \n\t" -" prfm PLDL2KEEP, [x1, 256*3] \n\t" -" prfm PLDL2KEEP, [x1, 256*4] \n\t" -" prfm PLDL2KEEP, [x1, 256*5] \n\t" -" prfm PLDL2KEEP, [x1, 256*6] \n\t" -" prfm PLDL2KEEP, [x1, 256*7] \n\t" -" prfm PLDL2KEEP, [x1, 256*8] \n\t" -" prfm PLDL2KEEP, [x1, 256*9] \n\t" -" prfm PLDL2KEEP, [x1, 256*10] \n\t" -" prfm PLDL2KEEP, [x1, 256*11] \n\t" -" prfm PLDL2KEEP, [x1, 256*12] \n\t" -" prfm PLDL2KEEP, [x1, 256*13] \n\t" -" prfm PLDL2KEEP, [x1, 256*14] \n\t" -" prfm PLDL2KEEP, [x1, 256*15] \n\t" -" prfm PLDL2KEEP, [x2] \n\t" -" prfm PLDL2KEEP, [x2, 256*1] \n\t" -" prfm PLDL2KEEP, [x2, 256*2] \n\t" -" prfm PLDL2KEEP, [x2, 256*3] \n\t" -" prfm PLDL2KEEP, [x2, 256*4] \n\t" -" prfm PLDL2KEEP, [x2, 256*5] \n\t" -" prfm PLDL2KEEP, [x2, 256*6] \n\t" -" prfm PLDL2KEEP, [x2, 256*7] \n\t" -" prfm PLDL2KEEP, [x2, 256*8] \n\t" -" prfm PLDL2KEEP, [x2, 256*9] \n\t" -" \n\t" -" WRITE_MEM: \n\t" -" \n\t" -" fmov d28, #1.0 \n\t" -" fmov x16, d28 \n\t" -" cmp x16, x12 \n\t" -" b.eq UNIT_ALPHA \n\t" -" \n\t" -SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30) -" \n\t" -" UNIT_ALPHA: \n\t" -" mov x9, x5 \n\t" // C address for loading. -" mov x10, x5 \n\t" // C address for storing. -" cmp x6, #1 \n\t" -" b.ne WRITE_MEM_G \n\t" -" \n\t" -" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30]. 
-" \n\t" // Here used scratch: Z[20-29]. -" mov x13, xzr \n\t" // C-column's physical 1-vector skip. -" incb x13 \n\t" -GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x9,x7) -GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31) -GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x9,x7) -" \n\t" -GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x10,x7) -GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) -GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x10,x7) -" b END_WRITE_MEM \n\t" -" \n\t" -" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30]. -" \n\t" // Here used scratch: Z[20-30] - Z30 as index. -" mov x12, xzr \n\t" -" incb x12 \n\t" -" madd x13, x12, x6, xzr \n\t" // C-column's logical 1-vector skip. -" index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8. -GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x9,x7,x13,x16) -GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31) -GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x9,x7,x13,x16) -" \n\t" -GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x10,x7,x13,x16) -GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) -GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x10,x7,x13,x16) -" \n\t" -" END_WRITE_MEM: \n\t" -" subs x8, x8, #1 \n\t" -" b.eq END_EXEC \n\t" -" \n\t" -" add x0, x0, x3 \n\t" // Forward A's base address to the next logic panel. -" add x5, x5, x13 \n\t" // Forward C's base address to the next logic panel. -" add x5, x5, x13 \n\t" -" b MILLIKER_MLOOP \n\t" -" \n\t" -" END_ERROR: \n\t" -" mov x0, #1 \n\t" // Return error. -" END_EXEC: \n\t" -" mov x0, #0 \n\t" // Return normal. -: -: [ai] "m" (ai), - [rs_a] "m" (rs_a), - [cs_a] "m" (cs_a), - [ps_a] "m" (ps_a), - [rs_b] "m" (rs_b), - [ci] "m" (ci), - [rs_c] "m" (rs_c), - [cs_c] "m" (cs_c), - [m_mker] "m" (m_mker), - [m_left] "m" (m_left), - [bi] "m" (bi), - [k_mker] "m" (k_mker), - [k_left] "m" (k_left), - [alpha] "m" (alpha), - [beta] "m" (beta), - [a_next] "m" (a_next), - [b_next] "m" (b_next) -: "x0","x1","x2","x3","x4","x5","x6","x7","x8", - "x9","x10","x11","x12","x13","x14","x15","x16",//"x17", - "z0","z1","z2","z3","z4","z5","z6","z7", - "z8","z9","z10","z11","z12","z13","z14","z15", - "z16","z17","z18","z19", - "z20","z21","z22","z23", - "z24","z25","z26","z27", - "z28","z29","z30","z31" - ); - } -} - diff --git a/kernels/armsve/bli_kernels_armsve.h b/kernels/armsve/bli_kernels_armsve.h index 3ccd79b68e..79ac710ab4 100644 --- a/kernels/armsve/bli_kernels_armsve.h +++ b/kernels/armsve/bli_kernels_armsve.h @@ -35,9 +35,10 @@ GEMM_UKR_PROT( double, d, gemm_armsve256_asm_8x8 ) GEMM_UKR_PROT( double, d, gemm_armsve_asm_2vx10_unindexed ) GEMM_UKR_PROT( float, s, gemm_armsve_asm_2vx10_unindexed ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_2vx10_unindexed ) -GEMMSUP_KER_PROT( double, d, gemmsup_cv_armsve_2vx10_unindexed ) -GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_10x2v_unindexed ) +GEMM_UKR_PROT( scomplex, c, gemm_armsve_asm_2vx10_unindexed ) +GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx10_unindexed ) +GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx8_unindexed ) +GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx7_unindexed ) PACKM_KER_PROT( double, d, packm_armsve256_asm_8xk ) PACKM_KER_PROT( double, d, packm_armsve512_asm_16xk )