diff --git a/.wolfssl_known_macro_extras b/.wolfssl_known_macro_extras index 636730d784..618b679755 100644 --- a/.wolfssl_known_macro_extras +++ b/.wolfssl_known_macro_extras @@ -546,6 +546,7 @@ WOLFCRYPT_FIPS_CORE_DYNAMIC_HASH_VALUE WOLFSENTRY_H WOLFSENTRY_NO_JSON WOLFSSL_32BIT_MILLI_TIME +WOLFSSL_AARCH64_PRIVILEGE_MODE WOLFSSL_AESNI_BY4 WOLFSSL_AESNI_BY6 WOLFSSL_AFTER_DATE_CLOCK_SKEW @@ -904,6 +905,7 @@ __MINGW32__ __MINGW64_VERSION_MAJOR __MINGW64__ __MWERKS__ +__OpenBSD__ __PIE__ __POWERPC__ __PPC__ diff --git a/configure.ac b/configure.ac index 16af3bb243..6a8edb2015 100644 --- a/configure.ac +++ b/configure.ac @@ -2972,6 +2972,7 @@ then fi +ENABLED_ARMASM_CRYPTO="unknown" ENABLED_ARMASM_INLINE="no" ENABLED_ARMASM_SHA3="no" ENABLED_ARMASM_CRYPTO_SM4="no" @@ -2993,6 +2994,9 @@ then inline) ENABLED_ARMASM_INLINE=yes ;; + no-crypto) + ENABLED_ARMASM_CRYPTO=no + ;; sha512-crypto | sha3-crypto) case $host_cpu in *aarch64*) @@ -3068,7 +3072,9 @@ then esac # Include options.h AM_CCASFLAGS="$AM_CCASFLAGS -DEXTERNAL_OPTS_OPENVPN" - ENABLED_ARMASM_CRYPTO=yes + if test "$ENABLED_ARMASM_CRYPTO" = "unknown"; then + ENABLED_ARMASM_CRYPTO=yes + fi ENABLED_ARMASM_NEON=yes ENABLED_ARM_64=yes @@ -3169,6 +3175,9 @@ fi if test "$ENABLED_ARMASM_SM4" = "yes"; then AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_ARMASM_CRYPTO_SM4" fi +if test "$ENABLED_ARMASM_CRYPTO" = "unknown"; then + ENABLED_ARMASM_CRYPTO=no +fi if test "$ENABLED_ARMASM_CRYPTO" = "no"; then AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_ARMASM_NO_HW_CRYPTO" fi diff --git a/wolfcrypt/benchmark/benchmark.c b/wolfcrypt/benchmark/benchmark.c index 183ac5ff7a..7a80981fb1 100644 --- a/wolfcrypt/benchmark/benchmark.c +++ b/wolfcrypt/benchmark/benchmark.c @@ -229,6 +229,8 @@ #include #endif +#include + #ifdef USE_FLAT_BENCHMARK_H #include "benchmark.h" #else @@ -3939,6 +3941,46 @@ static void* benchmarks_do(void* args) return NULL; } +#if defined(HAVE_CPUID) && defined(WOLFSSL_TEST_STATIC_BUILD) +static void print_cpu_features(void) +{ + word32 cpuid_flags = cpuid_get_flags(); + + printf("CPU: "); +#ifdef HAVE_CPUID_INTEL + printf("Intel"); +#ifdef WOLFSSL_X86_64_BUILD + printf(" x86_64"); +#else + printf(" x86"); +#endif + printf(" -"); + if (IS_INTEL_AVX1(cpuid_flags)) printf(" avx1"); + if (IS_INTEL_AVX2(cpuid_flags)) printf(" avx2"); + if (IS_INTEL_RDRAND(cpuid_flags)) printf(" rdrand"); + if (IS_INTEL_RDSEED(cpuid_flags)) printf(" rdseed"); + if (IS_INTEL_BMI2(cpuid_flags)) printf(" bmi2"); + if (IS_INTEL_AESNI(cpuid_flags)) printf(" aesni"); + if (IS_INTEL_ADX(cpuid_flags)) printf(" adx"); + if (IS_INTEL_MOVBE(cpuid_flags)) printf(" movbe"); + if (IS_INTEL_BMI1(cpuid_flags)) printf(" bmi1"); + if (IS_INTEL_SHA(cpuid_flags)) printf(" sha"); +#endif +#ifdef __aarch64__ + printf("Aarch64 -"); + if (IS_AARCH64_AES(cpuid_flags)) printf(" aes"); + if (IS_AARCH64_PMULL(cpuid_flags)) printf(" pmull"); + if (IS_AARCH64_SHA256(cpuid_flags)) printf(" sha256"); + if (IS_AARCH64_SHA512(cpuid_flags)) printf(" sha512"); + if (IS_AARCH64_RDM(cpuid_flags)) printf(" rdm"); + if (IS_AARCH64_SHA3(cpuid_flags)) printf(" sha3"); + if (IS_AARCH64_SM3(cpuid_flags)) printf(" sm3"); + if (IS_AARCH64_SM4(cpuid_flags)) printf(" sm4"); +#endif + printf("\n"); +} +#endif + int benchmark_init(void) { int ret = 0; @@ -3959,6 +4001,10 @@ int benchmark_init(void) return EXIT_FAILURE; } +#if defined(HAVE_CPUID) && defined(WOLFSSL_TEST_STATIC_BUILD) + print_cpu_features(); +#endif + #ifdef HAVE_WC_INTROSPECTION printf("Math: %s\n", wc_GetMathInfo()); #endif diff --git a/wolfcrypt/src/aes.c 
b/wolfcrypt/src/aes.c index 7f5e758475..f5d9f65ff0 100644 --- a/wolfcrypt/src/aes.c +++ b/wolfcrypt/src/aes.c @@ -106,7 +106,7 @@ block cipher mechanism that uses n-bit binary string parameter key with 128-bits #include #endif -#if !defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_RISCV_ASM) +#if !defined(WOLFSSL_RISCV_ASM) #ifdef WOLFSSL_IMX6_CAAM_BLOB /* case of possibly not using hardware acceleration for AES but using key @@ -787,6 +787,26 @@ block cipher mechanism that uses n-bit binary string parameter key with 128-bits } #endif /* HAVE_AES_DECRYPT */ +#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + + #define NEED_AES_TABLES + + static int checkedCpuIdFlags = 0; + static word32 cpuid_flags = 0; + + static void Check_CPU_support_HwCrypto(Aes* aes) + { + if (checkedCpuIdFlags == 0) { + cpuid_flags = cpuid_get_flags(); + checkedCpuIdFlags = 1; + } + aes->use_aes_hw_crypto = IS_AARCH64_AES(cpuid_flags); + #ifdef HAVE_AESGCM + aes->use_pmull_hw_crypto = IS_AARCH64_PMULL(cpuid_flags); + #endif + } + #elif (defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_AES) \ && !defined(WOLFSSL_QNX_CAAM)) || \ ((defined(WOLFSSL_AFALG) || defined(WOLFSSL_DEVCRYPTO_AES)) && \ @@ -2875,6 +2895,13 @@ static WARN_UNUSED_RESULT int wc_AesEncrypt( printf("Skipping AES-NI\n"); #endif } +#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + AES_encrypt_AARCH64(inBlock, outBlock, (byte*)aes->key, + (int)aes->rounds); + return 0; + } #endif /* WOLFSSL_AESNI */ #if defined(WOLFSSL_SCE) && !defined(WOLFSSL_SCE_NO_AES) AES_ECB_encrypt(aes, inBlock, outBlock, WC_AES_BLOCK_SIZE); @@ -3630,6 +3657,13 @@ static WARN_UNUSED_RESULT int wc_AesDecrypt( printf("Skipping AES-NI\n"); #endif } +#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + AES_decrypt_AARCH64(inBlock, outBlock, (byte*)aes->key, + (int)aes->rounds); + return 0; + } #endif /* WOLFSSL_AESNI */ #if defined(WOLFSSL_SCE) && !defined(WOLFSSL_SCE_NO_AES) return AES_ECB_decrypt(aes, inBlock, outBlock, WC_AES_BLOCK_SIZE); @@ -4580,6 +4614,14 @@ static void AesSetKey_C(Aes* aes, const byte* key, word32 keySz, int dir) } #endif /* WOLFSSL_AESNI */ + #if defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + Check_CPU_support_HwCrypto(aes); + if (aes->use_aes_hw_crypto) { + return AES_set_key_AARCH64(userKey, keylen, aes, dir); + } + #endif + #ifdef WOLFSSL_KCAPI_AES XMEMCPY(aes->devKey, userKey, keylen); if (aes->init != 0) { @@ -5777,6 +5819,14 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) } } else + #elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + AES_CBC_encrypt_AARCH64(in, out, sz, (byte*)aes->reg, + (byte*)aes->key, (int)aes->rounds); + ret = 0; + } + else #endif { ret = 0; @@ -5917,6 +5967,14 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) ret = 0; } else + #elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + AES_CBC_decrypt_AARCH64(in, out, sz, (byte*)aes->reg, + (byte*)aes->key, (int)aes->rounds); + ret = 0; + } + else #endif { ret = 0; @@ -6255,6 +6313,14 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) aes->left -= processed; sz -= processed; + #if defined(__aarch64__) && 
defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + AES_CTR_encrypt_AARCH64(aes, out, in, sz); + return 0; + } + #endif + VECTOR_REGISTERS_PUSH; #if defined(HAVE_AES_ECB) && !defined(WOLFSSL_PIC32MZ_CRYPT) && \ @@ -6343,7 +6409,7 @@ int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) #endif /* NEED_AES_CTR_SOFT */ #endif /* WOLFSSL_AES_COUNTER */ -#endif /* !WOLFSSL_ARMASM && ! WOLFSSL_RISCV_ASM */ +#endif /* !WOLFSSL_RISCV_ASM */ /* @@ -6390,10 +6456,7 @@ static WC_INLINE void IncCtr(byte* ctr, word32 ctrSz) #endif -#ifdef WOLFSSL_ARMASM - /* implementation is located in wolfcrypt/src/port/arm/armv8-aes.c */ - -#elif defined(WOLFSSL_RISCV_ASM) +#if defined(WOLFSSL_RISCV_ASM) /* implemented in wolfcrypt/src/port/risc-v/riscv-64-aes.c */ #elif defined(WOLFSSL_AFALG) @@ -6603,6 +6666,13 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) return ret; #endif /* WOLFSSL_RENESAS_RSIP && WOLFSSL_RENESAS_FSPSM_CRYPTONLY*/ +#if defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (ret == 0 && aes->use_aes_hw_crypto && aes->use_pmull_hw_crypto) { + AES_GCM_set_key_AARCH64(aes, iv); + } + else +#endif #if !defined(FREESCALE_LTC_AES_GCM) if (ret == 0) { VECTOR_REGISTERS_PUSH; @@ -7320,6 +7390,8 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, */ #define GHASH_INIT_EXTRA(aes) WC_DO_NOTHING +#if !defined(__aarch64__) || !defined(WOLFSSL_ARMASM) || \ + defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) /* GHASH one block of data.. * * XOR block into tag and GMULT with H using pre-computed table. @@ -7333,6 +7405,7 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, GMULT(AES_TAG(aes), (aes)->gcm.M0); \ } \ while (0) +#endif #endif /* WOLFSSL_AESGCM_STREAM */ #elif defined(WORD64_AVAILABLE) && !defined(GCM_WORD32) @@ -7928,8 +8001,17 @@ static void GHASH_INIT(Aes* aes) { /* Reset counts of AAD and cipher text. */ aes->aOver = 0; aes->cOver = 0; - /* Extra initialization based on implementation. */ - GHASH_INIT_EXTRA(aes); +#if defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto && aes->use_pmull_hw_crypto) { + ; /* Don't do extra initialization. */ + } + else +#endif + { + /* Extra initialization based on implementation. */ + GHASH_INIT_EXTRA(aes); + } } /* Update the GHASH with AAD and/or cipher text. 
@@ -8590,6 +8672,14 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, } } else +#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto && aes->use_pmull_hw_crypto) { + AES_GCM_encrypt_AARCH64(aes, out, in, sz, iv, ivSz, authTag, authTagSz, + authIn, authInSz); + ret = 0; + } + else #endif /* WOLFSSL_AESNI */ { ret = AES_GCM_encrypt_C(aes, out, in, sz, iv, ivSz, authTag, authTagSz, @@ -9174,6 +9264,13 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, } } else +#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto && aes->use_pmull_hw_crypto) { + ret = AES_GCM_decrypt_AARCH64(aes, out, in, sz, iv, ivSz, authTag, + authTagSz, authIn, authInSz); + } + else #endif /* WOLFSSL_AESNI */ { ret = AES_GCM_decrypt_C(aes, out, in, sz, iv, ivSz, authTag, authTagSz, @@ -10088,7 +10185,20 @@ int wc_AesGcmInit(Aes* aes, const byte* key, word32 len, const byte* iv, RESTORE_VECTOR_REGISTERS(); } else - #endif + #elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + AES_GCM_init_AARCH64(aes, iv, ivSz); + + /* Reset state fields. */ + aes->over = 0; + aes->aSz = 0; + aes->cSz = 0; + /* Initialization for GHASH. */ + GHASH_INIT(aes); + } + else + #endif /* WOLFSSL_AESNI */ { ret = AesGcmInit_C(aes, iv, ivSz); } @@ -10214,6 +10324,13 @@ int wc_AesGcmEncryptUpdate(Aes* aes, byte* out, const byte* in, word32 sz, RESTORE_VECTOR_REGISTERS(); } else + #elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + AES_GCM_crypt_update_AARCH64(aes, out, in, sz); + GHASH_UPDATE_AARCH64(aes, authIn, authInSz, out, sz); + } + else #endif { /* Encrypt the plaintext. */ @@ -10267,6 +10384,12 @@ int wc_AesGcmEncryptFinal(Aes* aes, byte* authTag, word32 authTagSz) RESTORE_VECTOR_REGISTERS(); } else + #elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + AES_GCM_final_AARCH64(aes, authTag, authTagSz); + } + else #endif { ret = AesGcmFinal_C(aes, authTag, authTagSz); @@ -10350,6 +10473,13 @@ int wc_AesGcmDecryptUpdate(Aes* aes, byte* out, const byte* in, word32 sz, RESTORE_VECTOR_REGISTERS(); } else + #elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + GHASH_UPDATE_AARCH64(aes, authIn, authInSz, in, sz); + AES_GCM_crypt_update_AARCH64(aes, out, in, sz); + } + else #endif { /* Update the authentication tag with any authentication data and @@ -10401,6 +10531,17 @@ int wc_AesGcmDecryptFinal(Aes* aes, const byte* authTag, word32 authTagSz) RESTORE_VECTOR_REGISTERS(); } else + #elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + ALIGN32 byte calcTag[WC_AES_BLOCK_SIZE]; + AES_GCM_final_AARCH64(aes, calcTag, authTagSz); + /* Check calculated tag matches the one passed in. 
*/ + if (ConstantCompare(authTag, calcTag, (int)authTagSz) != 0) { + ret = AES_GCM_AUTH_E; + } + } + else #endif { ALIGN32 byte calcTag[WC_AES_BLOCK_SIZE]; @@ -10677,10 +10818,7 @@ int wc_AesCcmCheckTagSize(int sz) return 0; } -#ifdef WOLFSSL_ARMASM - /* implementation located in wolfcrypt/src/port/arm/armv8-aes.c */ - -#elif defined(WOLFSSL_RISCV_ASM) +#if defined(WOLFSSL_RISCV_ASM) /* implementation located in wolfcrypt/src/port/risc-v/riscv-64-aes.c */ #elif defined(HAVE_COLDFIRE_SEC) @@ -11686,6 +11824,12 @@ static WARN_UNUSED_RESULT int _AesEcbEncrypt( AES_ECB_encrypt_AESNI(in, out, sz, (byte*)aes->key, (int)aes->rounds); } else +#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + AES_encrypt_AARCH64(in, out, (byte*)aes->key, (int)aes->rounds); + } + else #endif { #ifdef NEED_AES_TABLES @@ -11738,6 +11882,12 @@ static WARN_UNUSED_RESULT int _AesEcbDecrypt( AES_ECB_decrypt_AESNI(in, out, sz, (byte*)aes->key, (int)aes->rounds); } else +#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + AES_decrypt_AARCH64(in, out, (byte*)aes->key, (int)aes->rounds); + } + else #endif { #ifdef NEED_AES_TABLES @@ -12838,7 +12988,6 @@ void AES_XTS_decrypt_update_avx1(const unsigned char *in, unsigned char *out, wo #endif /* WOLFSSL_AESNI */ -#if !defined(WOLFSSL_ARMASM) || defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) #ifdef HAVE_AES_ECB /* helper function for encrypting / decrypting full buffer at once */ static WARN_UNUSED_RESULT int _AesXtsHelper( @@ -13100,6 +13249,13 @@ int wc_AesXtsEncrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, RESTORE_VECTOR_REGISTERS(); } else +#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + AES_XTS_encrypt_AARCH64(xaes, out, in, sz, i); + ret = 0; + } + else #endif { ret = AesXtsEncrypt_sw(xaes, out, in, sz, i); @@ -13533,6 +13689,13 @@ int wc_AesXtsDecrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, RESTORE_VECTOR_REGISTERS(); } else +#elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + if (aes->use_aes_hw_crypto) { + AES_XTS_decrypt_AARCH64(xaes, out, in, sz, i); + ret = 0; + } + else #endif { ret = AesXtsDecrypt_sw(xaes, out, in, sz, i); @@ -13730,8 +13893,6 @@ int wc_AesXtsDecryptFinal(XtsAes* xaes, byte* out, const byte* in, word32 sz, #endif /* WOLFSSL_AESXTS_STREAM */ -#endif /* !WOLFSSL_ARMASM || WOLFSSL_ARMASM_NO_HW_CRYPTO */ - /* Same as wc_AesXtsEncryptSector but the sector gets incremented by one every * sectorSz bytes * diff --git a/wolfcrypt/src/cpuid.c b/wolfcrypt/src/cpuid.c index 67223860c8..2e63a092bf 100644 --- a/wolfcrypt/src/cpuid.c +++ b/wolfcrypt/src/cpuid.c @@ -28,7 +28,8 @@ #include -#if defined(HAVE_CPUID) || defined(HAVE_CPUID_INTEL) +#if defined(HAVE_CPUID) || defined(HAVE_CPUID_INTEL) || \ + defined(HAVE_CPUID_AARCH64) static word32 cpuid_check = 0; static word32 cpuid_flags = 0; #endif @@ -101,6 +102,208 @@ cpuid_check = 1; } } +#elif defined(HAVE_CPUID_AARCH64) + +#define CPUID_AARCH64_FEAT_AES ((word64)1 << 4) +#define CPUID_AARCH64_FEAT_PMULL ((word64)1 << 5) +#define CPUID_AARCH64_FEAT_SHA256 ((word64)1 << 12) +#define CPUID_AARCH64_FEAT_SHA256_512 ((word64)1 << 13) +#define CPUID_AARCH64_FEAT_RDM ((word64)1 << 28) +#define CPUID_AARCH64_FEAT_SHA3 ((word64)1 << 32) +#define CPUID_AARCH64_FEAT_SM3 ((word64)1 << 36) +#define CPUID_AARCH64_FEAT_SM4 
((word64)1 << 40) + +#ifdef WOLFSSL_AARCH64_PRIVILEGE_MODE + /* https://developer.arm.com/documentation/ddi0601/2024-09/AArch64-Registers + * /ID-AA64ISAR0-EL1--AArch64-Instruction-Set-Attribute-Register-0 */ + + void cpuid_set_flags(void) + { + if (!cpuid_check) { + word64 features; + + __asm__ __volatile ( + "mrs %[feat], ID_AA64ISAR0_EL1\n" + : [feat] "=r" (features) + : + : + ); + + if (features & CPUID_AARCH64_FEAT_AES) + cpuid_flags |= CPUID_AES; + if (features & CPUID_AARCH64_FEAT_PMULL) + cpuid_flags |= CPUID_PMULL; + if (features & CPUID_AARCH64_FEAT_SHA256) + cpuid_flags |= CPUID_SHA256; + if (features & CPUID_AARCH64_FEAT_SHA256_512) + cpuid_flags |= CPUID_SHA256 | CPUID_SHA512; + if (features & CPUID_AARCH64_FEAT_RDM) + cpuid_flags |= CPUID_RDM; + if (features & CPUID_AARCH64_FEAT_SHA3) + cpuid_flags |= CPUID_SHA3; + if (features & CPUID_AARCH64_FEAT_SM3) + cpuid_flags |= CPUID_SM3; + if (features & CPUID_AARCH64_FEAT_SM4) + cpuid_flags |= CPUID_SM4; + + cpuid_check = 1; + } + } +#elif defined(__linux__) + /* https://community.arm.com/arm-community-blogs/b/operating-systems-blog/ + * posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu */ + + #include + #include + + void cpuid_set_flags(void) + { + if (!cpuid_check) { + word64 hwcaps = getauxval(AT_HWCAP); + + if (hwcaps & HWCAP_AES) + cpuid_flags |= CPUID_AES; + if (hwcaps & HWCAP_PMULL) + cpuid_flags |= CPUID_PMULL; + if (hwcaps & HWCAP_SHA2) + cpuid_flags |= CPUID_SHA256; + if (hwcaps & HWCAP_SHA512) + cpuid_flags |= CPUID_SHA512; + if (hwcaps & HWCAP_ASIMDRDM) + cpuid_flags |= CPUID_RDM; + if (hwcaps & HWCAP_SHA3) + cpuid_flags |= CPUID_SHA3; + if (hwcaps & HWCAP_SM3) + cpuid_flags |= CPUID_SM3; + if (hwcaps & HWCAP_SM4) + cpuid_flags |= CPUID_SM4; + + cpuid_check = 1; + } + } +#elif defined(__ANDROID__) || defined(ANDROID) + /* https://community.arm.com/arm-community-blogs/b/operating-systems-blog/ + * posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu */ + + #include "cpu-features.h" + + void cpuid_set_flags(void) + { + if (!cpuid_check) { + word64 features = android_getCpuFeatures(); + + if (features & ANDROID_CPU_ARM_FEATURE_AES) + cpuid_flags |= CPUID_AES; + if (features & ANDROID_CPU_ARM_FEATURE_PMULL) + cpuid_flags |= CPUID_PMULL; + if (features & ANDROID_CPU_ARM_FEATURE_SHA2) + cpuid_flags |= CPUID_SHA256; + + cpuid_check = 1; + } + } +#elif defined(__APPLE__) + /* https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/ + * determining_instruction_set_characteristics */ + + #include + + static word64 cpuid_get_sysctlbyname(const char* name) + { + word64 ret = 0; + size_t size = sizeof(ret); + + sysctlbyname(name, &ret, &size, NULL, 0); + + return ret; + } + + void cpuid_set_flags(void) + { + if (!cpuid_check) { + if (cpuid_get_sysctlbyname("hw.optional.arm.FEAT_AES") != 0) + cpuid_flags |= CPUID_AES; + if (cpuid_get_sysctlbyname("hw.optional.arm.FEAT_PMULL") != 0) + cpuid_flags |= CPUID_PMULL; + if (cpuid_get_sysctlbyname("hw.optional.arm.FEAT_SHA256") != 0) + cpuid_flags |= CPUID_SHA256; + if (cpuid_get_sysctlbyname("hw.optional.arm.FEAT_SHA512") != 0) + cpuid_flags |= CPUID_SHA512; + if (cpuid_get_sysctlbyname("hw.optional.arm.FEAT_RDM") != 0) + cpuid_flags |= CPUID_RDM; + if (cpuid_get_sysctlbyname("hw.optional.arm.FEAT_SHA3") != 0) + cpuid_flags |= CPUID_SHA3; + #ifdef WOLFSSL_ARMASM_CRYPTO_SM3 + cpuid_flags |= CPUID_SM3; + #endif + #ifdef WOLFSSL_ARMASM_CRYPTO_SM4 + cpuid_flags |= CPUID_SM4; + #endif + + cpuid_check = 1; + } + } +#elif defined(__FreeBSD__) || defined(__OpenBSD__) + /* 
https://man.freebsd.org/cgi/man.cgi?elf_aux_info(3) */ + + #include + + void cpuid_set_flags(void) + { + if (!cpuid_check) { + word64 features = 0; + + elf_aux_info(AT_HWCAP, &features, sizeof(features)); + + if (features & CPUID_AARCH64_FEAT_AES) + cpuid_flags |= CPUID_AES; + if (features & CPUID_AARCH64_FEAT_PMULL) + cpuid_flags |= CPUID_PMULL; + if (features & CPUID_AARCH64_FEAT_SHA256) + cpuid_flags |= CPUID_SHA256; + if (features & CPUID_AARCH64_FEAT_SHA256_512) + cpuid_flags |= CPUID_SHA256 | CPUID_SHA512; + if (features & CPUID_AARCH64_FEAT_RDM) + cpuid_flags |= CPUID_RDM; + if (features & CPUID_AARCH64_FEAT_SHA3) + cpuid_flags |= CPUID_SHA3; + if (features & CPUID_AARCH64_FEAT_SM3) + cpuid_flags |= CPUID_SM3; + if (features & CPUID_AARCH64_FEAT_SM4) + cpuid_flags |= CPUID_SM4; + + cpuid_check = 1; + } + } +#else + void cpuid_set_flags(void) + { + if (!cpuid_check) { + + #ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO + cpuid_flags |= CPUID_AES; + cpuid_flags |= CPUID_PMULL; + cpuid_flags |= CPUID_SHA256; + #endif + #ifdef WOLFSSL_ARMASM_CRYPTO_SHA512 + cpuid_flags |= CPUID_SHA512; + #endif + #ifndef WOLFSSL_AARCH64_NO_SQRMLSH + cpuid_flags |= CPUID_RDM; + #endif + #ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + cpuid_flags |= CPUID_SHA3; + #endif + #ifdef WOLFSSL_ARMASM_CRYPTO_SM3 + cpuid_flags |= CPUID_SM3; + #endif + #ifdef WOLFSSL_ARMASM_CRYPTO_SM4 + cpuid_flags |= CPUID_SM4; + #endif + cpuid_check = 1; + } + } +#endif #elif defined(HAVE_CPUID) void cpuid_set_flags(void) { diff --git a/wolfcrypt/src/port/arm/armv8-aes.c b/wolfcrypt/src/port/arm/armv8-aes.c index 0eca6775e8..9ae90e8cfa 100644 --- a/wolfcrypt/src/port/arm/armv8-aes.c +++ b/wolfcrypt/src/port/arm/armv8-aes.c @@ -175,48 +175,20 @@ static WC_INLINE void FlattenSzInBits(byte* buf, word32 sz) #endif /* HAVE_AESGCM */ -/* Similar to wolfSSL software implementation of expanding the AES key. - * Changed out the locations of where table look ups where made to - * use hardware instruction. Also altered decryption key to match. 
*/ -int wc_AesSetKey(Aes* aes, const byte* userKey, word32 keylen, - const byte* iv, int dir) +int AES_set_key_AARCH64(const unsigned char *userKey, const int keylen, + Aes* aes, int dir) { word32 temp; - word32 *rk; + word32* rk = aes->key; unsigned int i = 0; -#if defined(AES_MAX_KEY_SIZE) - const word32 max_key_len = (AES_MAX_KEY_SIZE / 8); -#endif - - if (!((keylen == 16) || (keylen == 24) || (keylen == 32)) || - aes == NULL || userKey == NULL) - return BAD_FUNC_ARG; - - rk = aes->key; -#if defined(AES_MAX_KEY_SIZE) - /* Check key length */ - if (keylen > max_key_len) { - return BAD_FUNC_ARG; - } -#endif - - #if defined(WOLFSSL_AES_COUNTER) || defined(WOLFSSL_AES_CFB) || \ - defined(WOLFSSL_AES_OFB) || defined(WOLFSSL_AES_XTS) - aes->left = 0; - #endif /* WOLFSSL_AES_COUNTER */ - - aes->keylen = keylen; - aes->rounds = keylen/4 + 6; XMEMCPY(rk, userKey, keylen); - switch(keylen) - { + switch (keylen) { #if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 128 && \ defined(WOLFSSL_AES_128) case 16: - while (1) - { + while (1) { temp = rk[3]; SBOX(temp); temp = rotrFixed(temp, 8); @@ -235,8 +207,7 @@ int wc_AesSetKey(Aes* aes, const byte* userKey, word32 keylen, defined(WOLFSSL_AES_192) case 24: /* for (;;) here triggers a bug in VC60 SP4 w/ Pro Pack */ - while (1) - { + while (1) { temp = rk[5]; SBOX(temp); temp = rotrFixed(temp, 8); @@ -256,8 +227,7 @@ int wc_AesSetKey(Aes* aes, const byte* userKey, word32 keylen, #if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 256 && \ defined(WOLFSSL_AES_256) case 32: - while (1) - { + while (1) { temp = rk[7]; SBOX(temp); temp = rotrFixed(temp, 8); @@ -283,8 +253,7 @@ int wc_AesSetKey(Aes* aes, const byte* userKey, word32 keylen, return BAD_FUNC_ARG; } - if (dir == AES_DECRYPTION) - { + if (dir == AES_DECRYPTION) { #ifdef HAVE_AES_DECRYPT unsigned int j; rk = aes->key; @@ -308,9 +277,10 @@ int wc_AesSetKey(Aes* aes, const byte* userKey, word32 keylen, #endif /* HAVE_AES_DECRYPT */ } - return wc_AesSetIV(aes, iv); + return 0; } +#ifndef __aarch64__ #if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) int wc_AesSetKeyDirect(Aes* aes, const byte* userKey, word32 keylen, const byte* iv, int dir) @@ -332,587 +302,521 @@ int wc_AesSetIV(Aes* aes, const byte* iv) return 0; } - +#endif #ifdef __aarch64__ /* AES CCM/GCM use encrypt direct but not decrypt */ #if defined(HAVE_AESCCM) || defined(HAVE_AESGCM) || \ - defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) - static int wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock) - { - word32* keyPt = aes->key; + defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) || \ + defined(HAVE_AES_CBC) - /* - AESE exor's input with round key - shift rows of exor'ed result - sub bytes for shifted rows - */ +void AES_encrypt_AARCH64(const byte* inBlock, byte* outBlock, byte* key, int nr) +{ + /* + AESE exor's input with round key + shift rows of exor'ed result + sub bytes for shifted rows + */ - __asm__ __volatile__ ( - "LD1 {v0.16b}, [%[CtrIn]] \n" - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v2.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v3.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v4.16b \n" - "AESMC v0.16b, v0.16b \n" - - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v2.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v3.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v4.16b \n" - "AESMC v0.16b, v0.16b \n" - - "LD1 
{v1.2d-v2.2d}, [%[Key]], #32 \n" - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v2.16b \n" - - "#subtract rounds done so far and see if should continue\n" - "MOV w12, %w[R] \n" - "SUB w12, w12, #10 \n" - "CBZ w12, 1f \n" - "LD1 {v1.2d-v2.2d}, [%[Key]], #32 \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v2.16b \n" - - "SUB w12, w12, #2 \n" - "CBZ w12, 1f \n" - "LD1 {v1.2d-v2.2d}, [%[Key]], #32 \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v2.16b \n" + __asm__ __volatile__ ( + "LD1 {v0.16b}, [%[in]] \n" + "LD1 {v1.2d-v4.2d}, [%[key]], #64 \n" - "#Final AddRoundKey then store result \n" - "1: \n" - "LD1 {v1.2d}, [%[Key]], #16 \n" - "EOR v0.16b, v0.16b, v1.16b \n" - "ST1 {v0.16b}, [%[CtrOut]] \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" - :[CtrOut] "=r" (outBlock), "=r" (keyPt), "=r" (aes->rounds), - "=r" (inBlock) - :"0" (outBlock), [Key] "1" (keyPt), [R] "2" (aes->rounds), - [CtrIn] "3" (inBlock) - : "cc", "memory", "w12", "v0", "v1", "v2", "v3", "v4" - ); + "LD1 {v1.2d-v4.2d}, [%[key]], #64 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" - return 0; - } -#endif /* AES_GCM, AES_CCM, DIRECT or COUNTER */ -#if defined(WOLFSSL_AES_DIRECT) || defined(WOLFSSL_AES_COUNTER) - #ifdef HAVE_AES_DECRYPT - static int wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock) - { - word32* keyPt = aes->key; + "LD1 {v1.2d-v2.2d}, [%[key]], #32 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" - /* - AESE exor's input with round key - shift rows of exor'ed result - sub bytes for shifted rows - */ + "#subtract rounds done so far and see if should continue\n" + "MOV w12, %w[nr] \n" + "SUB w12, w12, #10 \n" + "CBZ w12, 1f \n" + "LD1 {v1.2d-v2.2d}, [%[key]], #32 \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" - __asm__ __volatile__ ( - "LD1 {v0.16b}, [%[CtrIn]] \n" - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - - "AESD v0.16b, v1.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v2.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v3.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v4.16b \n" - "AESIMC v0.16b, v0.16b \n" - - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - "AESD v0.16b, v1.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v2.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v3.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v4.16b \n" - "AESIMC v0.16b, v0.16b \n" - - "LD1 {v1.2d-v2.2d}, [%[Key]], #32 \n" - "AESD v0.16b, v1.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v2.16b \n" - - "#subtract rounds done so far and see if should continue\n" - "MOV w12, %w[R] \n" - "SUB w12, w12, #10 \n" - "CBZ w12, 1f \n" - "LD1 {v1.2d-v2.2d}, [%[Key]], #32 \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v1.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v2.16b \n" - - "SUB w12, w12, #2 \n" - "CBZ w12, 1f \n" - "LD1 {v1.2d-v2.2d}, [%[Key]], #32 \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v1.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v2.16b \n" + "SUB w12, w12, #2 \n" + "CBZ w12, 1f \n" + 
"LD1 {v1.2d-v2.2d}, [%[key]], #32 \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" - "#Final AddRoundKey then store result \n" - "1: \n" - "LD1 {v1.2d}, [%[Key]], #16 \n" - "EOR v0.16b, v0.16b, v1.16b \n" - "ST1 {v0.4s}, [%[CtrOut]] \n" + "#Final AddRoundKey then store result \n" + "1: \n" + "LD1 {v1.2d}, [%[key]], #16 \n" + "EOR v0.16b, v0.16b, v1.16b \n" + "ST1 {v0.16b}, [%[out]] \n" - :[CtrOut] "=r" (outBlock), "=r" (keyPt), "=r" (aes->rounds), - "=r" (inBlock) - :[Key] "1" (aes->key), "0" (outBlock), [R] "2" (aes->rounds), - [CtrIn] "3" (inBlock) - : "cc", "memory", "w12", "v0", "v1", "v2", "v3", "v4" - ); + : [key] "+r" (key) + : [in] "r" (inBlock), [out] "r" (outBlock), [nr] "r" (nr) + : "cc", "memory", "w12", "v0", "v1", "v2", "v3", "v4" + ); +} +#endif /* AES_GCM, AES_CCM, DIRECT or COUNTER */ +#if !defined(WC_AES_BITSLICED) || defined(WOLFSSL_AES_DIRECT) || \ + defined(WOLFSSL_AES_COUNTER) +#ifdef HAVE_AES_DECRYPT +void AES_decrypt_AARCH64(const byte* inBlock, byte* outBlock, byte* key, int nr) +{ + /* + AESE exor's input with round key + shift rows of exor'ed result + sub bytes for shifted rows + */ - return 0; + __asm__ __volatile__ ( + "LD1 {v0.16b}, [%[in]] \n" + "LD1 {v1.2d-v4.2d}, [%[key]], #64 \n" + + "AESD v0.16b, v1.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v2.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v3.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v4.16b \n" + "AESIMC v0.16b, v0.16b \n" + + "LD1 {v1.2d-v4.2d}, [%[key]], #64 \n" + "AESD v0.16b, v1.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v2.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v3.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v4.16b \n" + "AESIMC v0.16b, v0.16b \n" + + "LD1 {v1.2d-v2.2d}, [%[key]], #32 \n" + "AESD v0.16b, v1.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v2.16b \n" + + "#subtract rounds done so far and see if should continue\n" + "MOV w12, %w[nr] \n" + "SUB w12, w12, #10 \n" + "CBZ w12, 1f \n" + "LD1 {v1.2d-v2.2d}, [%[key]], #32 \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v1.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v2.16b \n" + + "SUB w12, w12, #2 \n" + "CBZ w12, 1f \n" + "LD1 {v1.2d-v2.2d}, [%[key]], #32 \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v1.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v2.16b \n" + + "#Final AddRoundKey then store result \n" + "1: \n" + "LD1 {v1.2d}, [%[key]], #16 \n" + "EOR v0.16b, v0.16b, v1.16b \n" + "ST1 {v0.4s}, [%[out]] \n" + + : [key] "+r" (key) + : [in] "r" (inBlock), [out] "r" (outBlock), [nr] "r" (nr) + : "cc", "memory", "w12", "v0", "v1", "v2", "v3", "v4" + ); } - #endif /* HAVE_AES_DECRYPT */ +#endif /* HAVE_AES_DECRYPT */ #endif /* DIRECT or COUNTER */ /* AES-CBC */ #ifdef HAVE_AES_CBC - int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) - { - word32 numBlocks = sz / AES_BLOCK_SIZE; - - if (aes == NULL || out == NULL || in == NULL) { - return BAD_FUNC_ARG; - } - - if (sz == 0) { - return 0; - } - -#ifdef WOLFSSL_AES_CBC_LENGTH_CHECKS - if (sz % AES_BLOCK_SIZE) { - return BAD_LENGTH_E; - } -#endif +void AES_CBC_encrypt_AARCH64(const byte* in, byte* out, word32 sz, byte* reg, + byte* key, int rounds) +{ + word32 numBlocks = sz / AES_BLOCK_SIZE; - /* do as many block size ops as possible */ - if (numBlocks > 0) { - word32* key = aes->key; - word32* reg = aes->reg; - /* - AESE exor's input with round key - shift rows of exor'ed result + /* + AESE exor's input with round key + shift rows of 
exor'ed result sub bytes for shifted rows - note: grouping AESE & AESMC together as pairs reduces latency - */ - switch(aes->rounds) { + note: grouping AESE & AESMC together as pairs reduces latency + */ + switch (rounds) { #ifdef WOLFSSL_AES_128 - case 10: /* AES 128 BLOCK */ - __asm__ __volatile__ ( - "MOV w11, %w[blocks] \n" - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" - "LD1 {v9.2d-v11.2d},[%[Key]], #48 \n" - "LD1 {v0.2d}, [%[reg]] \n" - - "LD1 {v12.2d}, [%[input]], #16 \n" - "1:\n" - "#CBC operations, xorbuf in with current aes->reg \n" - "EOR v0.16b, v0.16b, v12.16b \n" - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v2.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v3.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v4.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v5.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v6.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v7.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v8.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v9.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v10.16b \n" - "SUB w11, w11, #1 \n" - "EOR v0.16b, v0.16b, v11.16b \n" - "ST1 {v0.2d}, [%[out]], #16 \n" - - "CBZ w11, 2f \n" - "LD1 {v12.2d}, [%[input]], #16 \n" - "B 1b \n" - - "2:\n" - "#store current counter value at the end \n" - "ST1 {v0.2d}, [%[regOut]] \n" - - :[out] "=r" (out), [regOut] "=r" (reg), "=r" (in) - :"0" (out), [Key] "r" (key), [input] "2" (in), - [blocks] "r" (numBlocks), [reg] "1" (reg) - : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13" - ); - break; + case 10: /* AES 128 BLOCK */ + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v1.2d-v4.2d}, [%[key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[key]], #64 \n" + "LD1 {v9.2d-v11.2d},[%[key]], #48 \n" + "LD1 {v0.2d}, [%[reg]] \n" + + "LD1 {v12.2d}, [%[in]], #16 \n" + "1:\n" + "#CBC operations, xorbuf in with current reg \n" + "EOR v0.16b, v0.16b, v12.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "SUB w11, w11, #1 \n" + "EOR v0.16b, v0.16b, v11.16b \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + + "CBZ w11, 2f \n" + "LD1 {v12.2d}, [%[in]], #16 \n" + "B 1b \n" + + "2:\n" + "#store current counter value at the end \n" + "ST1 {v0.2d}, [%[reg]] \n" + + : [out] "+r" (out), [in] "+r" (in), [key] "+r" (key) + : [reg] "r" (reg), [blocks] "r" (numBlocks) + : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13" + ); + break; #endif /* WOLFSSL_AES_128 */ #ifdef WOLFSSL_AES_192 - case 12: /* AES 192 BLOCK */ - __asm__ __volatile__ ( - "MOV w11, %w[blocks] \n" - "LD1 {v1.2d-v4.2d}, %[Key], #64 \n" - "LD1 {v5.2d-v8.2d}, %[Key], #64 \n" - "LD1 {v9.2d-v12.2d},%[Key], #64 \n" - "LD1 {v13.2d}, %[Key], #16 \n" - "LD1 {v0.2d}, %[reg] \n" - - "LD1 {v14.2d}, [%[input]], #16 \n" - "1:\n" - "#CBC operations, xorbuf in with current aes->reg \n" - "EOR v0.16b, v0.16b, v14.16b \n" - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE 
v0.16b, v2.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v3.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v4.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v5.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v6.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v7.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v8.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v9.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v10.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v11.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v12.16b \n" - "EOR v0.16b, v0.16b, v13.16b \n" - "SUB w11, w11, #1 \n" - "ST1 {v0.2d}, [%[out]], #16 \n" - - "CBZ w11, 2f \n" - "LD1 {v14.2d}, [%[input]], #16\n" - "B 1b \n" - - "2:\n" - "#store current counter value at the end \n" - "ST1 {v0.2d}, %[regOut] \n" - - - :[out] "=r" (out), [regOut] "=m" (aes->reg), "=r" (in) - :"0" (out), [Key] "m" (aes->key), [input] "2" (in), - [blocks] "r" (numBlocks), [reg] "m" (aes->reg) - : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14" - ); - break; + case 12: /* AES 192 BLOCK */ + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v1.2d-v4.2d}, [%[key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[key]], #64 \n" + "LD1 {v9.2d-v12.2d},[%[key]], #64 \n" + "LD1 {v13.2d}, [%[key]], #16 \n" + "LD1 {v0.2d}, [%[reg]] \n" + + "LD1 {v14.2d}, [%[in]], #16 \n" + "1:\n" + "#CBC operations, xorbuf in with current reg \n" + "EOR v0.16b, v0.16b, v14.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n" + "SUB w11, w11, #1 \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + + "CBZ w11, 2f \n" + "LD1 {v14.2d}, [%[in]], #16\n" + "B 1b \n" + + "2:\n" + "#store current counter value at the end \n" + "ST1 {v0.2d}, [%[reg]] \n" + + : [out] "+r" (out), [in] "+r" (in), [key] "+r" (key) + : [reg] "r" (reg), [blocks] "r" (numBlocks) + : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14" + ); + break; #endif /* WOLFSSL_AES_192*/ #ifdef WOLFSSL_AES_256 - case 14: /* AES 256 BLOCK */ - __asm__ __volatile__ ( - "MOV w11, %w[blocks] \n" - "LD1 {v1.2d-v4.2d}, %[Key], #64 \n" - - "LD1 {v5.2d-v8.2d}, %[Key], #64 \n" - "LD1 {v9.2d-v12.2d}, %[Key], #64 \n" - "LD1 {v13.2d-v15.2d}, %[Key], #48 \n" - "LD1 {v0.2d}, %[reg] \n" - - "LD1 {v16.2d}, [%[input]], #16 \n" - "1: \n" - "#CBC operations, xorbuf in with current aes->reg \n" - "EOR v0.16b, v0.16b, v16.16b \n" - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v2.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v3.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v4.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v5.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v6.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v7.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v8.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE 
v0.16b, v9.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v10.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v11.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v12.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v13.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v14.16b \n" - "EOR v0.16b, v0.16b, v15.16b \n" - "SUB w11, w11, #1 \n" - "ST1 {v0.2d}, [%[out]], #16 \n" - - "CBZ w11, 2f \n" - "LD1 {v16.2d}, [%[input]], #16 \n" - "B 1b \n" + case 14: /* AES 256 BLOCK */ + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v1.2d-v4.2d}, [%[key]], #64 \n" - "2: \n" - "#store current counter value at the end \n" - "ST1 {v0.2d}, %[regOut] \n" + "LD1 {v5.2d-v8.2d}, [%[key]], #64 \n" + "LD1 {v9.2d-v12.2d}, [%[key]], #64 \n" + "LD1 {v13.2d-v15.2d}, [%[key]], #48 \n" + "LD1 {v0.2d}, [%[reg]] \n" + "LD1 {v16.2d}, [%[in]], #16 \n" + "1: \n" + "#CBC operations, xorbuf in with current reg \n" + "EOR v0.16b, v0.16b, v16.16b \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "SUB w11, w11, #1 \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + + "CBZ w11, 2f \n" + "LD1 {v16.2d}, [%[in]], #16 \n" + "B 1b \n" - :[out] "=r" (out), [regOut] "=m" (aes->reg), "=r" (in) - :"0" (out), [Key] "m" (aes->key), [input] "2" (in), - [blocks] "r" (numBlocks), [reg] "m" (aes->reg) - : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14","v15", - "v16" - ); - break; + "2: \n" + "#store current counter value at the end \n" + "ST1 {v0.2d}, [%[reg]] \n" + + : [out] "+r" (out), [in] "+r" (in), [key] "+r" (key) + : [reg] "r" (reg), [blocks] "r" (numBlocks) + : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14","v15", + "v16" + ); + break; #endif /* WOLFSSL_AES_256 */ - default: - WOLFSSL_MSG("Bad AES-CBC round value"); - return BAD_FUNC_ARG; - } - } - - return 0; } +} - #ifdef HAVE_AES_DECRYPT - int wc_AesCbcDecrypt(Aes* aes, byte* out, const byte* in, word32 sz) - { - word32 numBlocks = sz / AES_BLOCK_SIZE; - - if (aes == NULL || out == NULL || in == NULL) { - return BAD_FUNC_ARG; - } - - if (sz == 0) { - return 0; - } - - if (sz % AES_BLOCK_SIZE) { -#ifdef WOLFSSL_AES_CBC_LENGTH_CHECKS - return BAD_LENGTH_E; -#else - return BAD_FUNC_ARG; -#endif - } - - /* do as many block size ops as possible */ - if (numBlocks > 0) { - word32* key = aes->key; - word32* reg = aes->reg; +#ifdef HAVE_AES_DECRYPT +void AES_CBC_decrypt_AARCH64(const byte* in, byte* out, word32 sz, + byte* reg, byte* key, int rounds) +{ + word32 numBlocks = sz / AES_BLOCK_SIZE; - switch(aes->rounds) { + switch (rounds) { #ifdef WOLFSSL_AES_128 - case 10: /* AES 128 BLOCK */ - __asm__ __volatile__ ( - "MOV w11, %w[blocks] \n" - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - "LD1 {v5.2d-v8.2d}, 
[%[Key]], #64 \n" - "LD1 {v9.2d-v11.2d},[%[Key]], #48 \n" - "LD1 {v13.2d}, [%[reg]] \n" - - "1:\n" - "LD1 {v0.2d}, [%[input]], #16 \n" - "MOV v12.16b, v0.16b \n" - "AESD v0.16b, v1.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v2.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v3.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v4.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v5.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v6.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v7.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v8.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v9.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v10.16b \n" - "EOR v0.16b, v0.16b, v11.16b \n" - - "EOR v0.16b, v0.16b, v13.16b \n" - "SUB w11, w11, #1 \n" - "ST1 {v0.2d}, [%[out]], #16 \n" - "MOV v13.16b, v12.16b \n" - - "CBZ w11, 2f \n" - "B 1b \n" + case 10: /* AES 128 BLOCK */ + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v1.2d-v4.2d}, [%[key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[key]], #64 \n" + "LD1 {v9.2d-v11.2d},[%[key]], #48 \n" + "LD1 {v13.2d}, [%[reg]] \n" + + "1:\n" + "LD1 {v0.2d}, [%[in]], #16 \n" + "MOV v12.16b, v0.16b \n" + "AESD v0.16b, v1.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v2.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v3.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v4.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v5.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v6.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v7.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v8.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v9.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v10.16b \n" + "EOR v0.16b, v0.16b, v11.16b \n" + + "EOR v0.16b, v0.16b, v13.16b \n" + "SUB w11, w11, #1 \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "MOV v13.16b, v12.16b \n" + + "CBZ w11, 2f \n" + "B 1b \n" - "2: \n" - "#store current counter value at the end \n" - "ST1 {v13.2d}, [%[regOut]] \n" + "2: \n" + "#store current counter value at the end \n" + "ST1 {v13.2d}, [%[reg]] \n" - :[out] "=r" (out), [regOut] "=r" (reg), "=r" (in) - :"0" (out), [Key] "r" (key), [input] "2" (in), - [blocks] "r" (numBlocks), [reg] "1" (reg) - : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13" - ); - break; + : [out] "+r" (out), [in] "+r" (in), [key] "+r" (key) + : [reg] "r" (reg), [blocks] "r" (numBlocks) + : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13" + ); + break; #endif /* WOLFSSL_AES_128 */ #ifdef WOLFSSL_AES_192 - case 12: /* AES 192 BLOCK */ - __asm__ __volatile__ ( - "MOV w11, %w[blocks] \n" - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" - "LD1 {v9.2d-v12.2d},[%[Key]], #64 \n" - "LD1 {v13.16b}, [%[Key]], #16 \n" - "LD1 {v15.2d}, [%[reg]] \n" - - "LD1 {v0.2d}, [%[input]], #16 \n" - "1: \n" - "MOV v14.16b, v0.16b \n" - "AESD v0.16b, v1.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v2.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v3.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v4.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v5.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v6.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v7.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v8.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v9.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v10.16b \n" - "AESIMC v0.16b, v0.16b \n" - 
"AESD v0.16b, v11.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v12.16b \n" - "EOR v0.16b, v0.16b, v13.16b \n" - - "EOR v0.16b, v0.16b, v15.16b \n" - "SUB w11, w11, #1 \n" - "ST1 {v0.2d}, [%[out]], #16 \n" - "MOV v15.16b, v14.16b \n" - - "CBZ w11, 2f \n" - "LD1 {v0.2d}, [%[input]], #16 \n" - "B 1b \n" - - "2:\n" - "#store current counter value at the end \n" - "ST1 {v15.2d}, [%[regOut]] \n" - - :[out] "=r" (out), [regOut] "=r" (reg), "=r" (in) - :"0" (out), [Key] "r" (key), [input] "2" (in), - [blocks] "r" (numBlocks), [reg] "1" (reg) - : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" - ); - break; + case 12: /* AES 192 BLOCK */ + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v1.2d-v4.2d}, [%[key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[key]], #64 \n" + "LD1 {v9.2d-v12.2d},[%[key]], #64 \n" + "LD1 {v13.16b}, [%[key]], #16 \n" + "LD1 {v15.2d}, [%[reg]] \n" + + "LD1 {v0.2d}, [%[in]], #16 \n" + "1: \n" + "MOV v14.16b, v0.16b \n" + "AESD v0.16b, v1.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v2.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v3.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v4.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v5.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v6.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v7.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v8.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v9.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v10.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v11.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n" + + "EOR v0.16b, v0.16b, v15.16b \n" + "SUB w11, w11, #1 \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "MOV v15.16b, v14.16b \n" + + "CBZ w11, 2f \n" + "LD1 {v0.2d}, [%[in]], #16 \n" + "B 1b \n" + + "2:\n" + "#store current counter value at the end \n" + "ST1 {v15.2d}, [%[reg]] \n" + + : [out] "+r" (out), [in] "+r" (in), [key] "+r" (key) + : [reg] "r" (reg), [blocks] "r" (numBlocks) + : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + ); + break; #endif /* WOLFSSL_AES_192 */ #ifdef WOLFSSL_AES_256 - case 14: /* AES 256 BLOCK */ - __asm__ __volatile__ ( - "MOV w11, %w[blocks] \n" - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" - "LD1 {v9.2d-v12.2d}, [%[Key]], #64 \n" - "LD1 {v13.2d-v15.2d}, [%[Key]], #48 \n" - "LD1 {v17.2d}, [%[reg]] \n" - - "LD1 {v0.2d}, [%[input]], #16 \n" - "1: \n" - "MOV v16.16b, v0.16b \n" - "AESD v0.16b, v1.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v2.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v3.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v4.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v5.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v6.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v7.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v8.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v9.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v10.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v11.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v12.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v13.16b \n" - "AESIMC v0.16b, v0.16b \n" - "AESD v0.16b, v14.16b \n" - "EOR v0.16b, v0.16b, v15.16b \n" - - "EOR v0.16b, v0.16b, v17.16b \n" - "SUB w11, w11, #1 \n" - "ST1 {v0.2d}, [%[out]], #16 \n" - "MOV v17.16b, v16.16b 
\n" - - "CBZ w11, 2f \n" - "LD1 {v0.2d}, [%[input]], #16 \n" - "B 1b \n" - - "2:\n" - "#store current counter value at the end \n" - "ST1 {v17.2d}, [%[regOut]] \n" - - :[out] "=r" (out), [regOut] "=r" (reg), "=r" (in) - :"0" (out), [Key] "r" (key), [input] "2" (in), - [blocks] "r" (numBlocks), [reg] "1" (reg) - : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14","v15", - "v16", "v17" - ); - break; + case 14: /* AES 256 BLOCK */ + __asm__ __volatile__ ( + "MOV w11, %w[blocks] \n" + "LD1 {v1.2d-v4.2d}, [%[key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[key]], #64 \n" + "LD1 {v9.2d-v12.2d}, [%[key]], #64 \n" + "LD1 {v13.2d-v15.2d}, [%[key]], #48 \n" + "LD1 {v17.2d}, [%[reg]] \n" + + "LD1 {v0.2d}, [%[in]], #16 \n" + "1: \n" + "MOV v16.16b, v0.16b \n" + "AESD v0.16b, v1.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v2.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v3.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v4.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v5.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v6.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v7.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v8.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v9.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v10.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v11.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v12.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v13.16b \n" + "AESIMC v0.16b, v0.16b \n" + "AESD v0.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + + "EOR v0.16b, v0.16b, v17.16b \n" + "SUB w11, w11, #1 \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "MOV v17.16b, v16.16b \n" + + "CBZ w11, 2f \n" + "LD1 {v0.2d}, [%[in]], #16 \n" + "B 1b \n" + + "2:\n" + "#store current counter value at the end \n" + "ST1 {v17.2d}, [%[reg]] \n" + + : [out] "+r" (out), [in] "+r" (in), [key] "+r" (key) + : [reg] "r" (reg), [blocks] "r" (numBlocks) + : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14","v15", + "v16", "v17" + ); + break; #endif /* WOLFSSL_AES_256 */ - default: - WOLFSSL_MSG("Bad AES-CBC round value"); - return BAD_FUNC_ARG; - } - } - - return 0; } - #endif +} +#endif #endif /* HAVE_AES_CBC */ @@ -1420,40 +1324,11 @@ static void wc_aes_ctr_encrypt_asm(Aes* aes, byte* out, const byte* in, } } -int wc_AesCtrEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) +void AES_CTR_encrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz) { byte* tmp; word32 numBlocks; - if (aes == NULL || out == NULL || in == NULL) { - return BAD_FUNC_ARG; - } - switch(aes->rounds) { - #ifdef WOLFSSL_AES_128 - case 10: /* AES 128 BLOCK */ - #endif /* WOLFSSL_AES_128 */ - #ifdef WOLFSSL_AES_192 - case 12: /* AES 192 BLOCK */ - #endif /* WOLFSSL_AES_192 */ - #ifdef WOLFSSL_AES_256 - case 14: /* AES 256 BLOCK */ - #endif /* WOLFSSL_AES_256 */ - break; - default: - WOLFSSL_MSG("Bad AES-CTR round value"); - return BAD_FUNC_ARG; - } - - - tmp = (byte*)aes->tmp + AES_BLOCK_SIZE - aes->left; - - /* consume any unused bytes left in aes->tmp */ - while ((aes->left != 0) && (sz != 0)) { - *(out++) = *(in++) ^ *(tmp++); - aes->left--; - sz--; - } - /* do as many block size ops as possible */ numBlocks = sz / AES_BLOCK_SIZE; if (numBlocks > 0) { @@ -1478,14 +1353,6 @@ int wc_AesCtrEncrypt(Aes* aes, byte* out, const byte* in, word32 sz) aes->left--; } } - return 0; -} - -int wc_AesCtrSetKey(Aes* aes, const byte* key, word32 len, 
- const byte* iv, int dir) -{ - (void)dir; - return wc_AesSetKey(aes, key, len, iv, AES_ENCRYPTION); } #endif /* WOLFSSL_AES_COUNTER */ @@ -1500,7 +1367,7 @@ int wc_AesCtrSetKey(Aes* aes, const byte* key, word32 len, /* PMULL and RBIT only with AArch64 */ /* Use ARM hardware for polynomial multiply */ -void GMULT(byte* X, byte* Y) +void GMULT_AARCH64(byte* X, byte* Y) { __asm__ volatile ( "LD1 {v0.16b}, [%[X]] \n" @@ -1532,7 +1399,7 @@ void GMULT(byte* X, byte* Y) ); } -void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, +static void GHASH_AARCH64(Gcm* gcm, const byte* a, word32 aSz, const byte* c, word32 cSz, byte* s, word32 sSz) { byte scratch[AES_BLOCK_SIZE]; @@ -1899,12 +1766,291 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, "v8", "v9", "v10", "v11", "v12", "v13", "v14" ); - XMEMCPY(s, scratch, sSz); + XMEMCPY(s, scratch, sSz); +} + +#ifdef WOLFSSL_AESGCM_STREAM + /* Access initialization counter data. */ + #define AES_INITCTR(aes) ((aes)->streamData + 0 * AES_BLOCK_SIZE) + /* Access counter data. */ + #define AES_COUNTER(aes) ((aes)->streamData + 1 * AES_BLOCK_SIZE) + /* Access tag data. */ + #define AES_TAG(aes) ((aes)->streamData + 2 * AES_BLOCK_SIZE) + /* Access last GHASH block. */ + #define AES_LASTGBLOCK(aes) ((aes)->streamData + 3 * AES_BLOCK_SIZE) + /* Access last encrypted block. */ + #define AES_LASTBLOCK(aes) ((aes)->streamData + 4 * AES_BLOCK_SIZE) + +/* GHASH one block of data. + * + * XOR block into tag and GMULT with H. + * + * @param [in, out] aes AES GCM object. + * @param [in] block Block of AAD or cipher text. + */ +#define GHASH_ONE_BLOCK_AARCH64(aes, block) \ + do { \ + xorbuf(AES_TAG(aes), block, AES_BLOCK_SIZE); \ + GMULT_AARCH64(AES_TAG(aes), aes->gcm.H); \ + } \ + while (0) + +/* Hash in the lengths of the AAD and cipher text in bits. + * + * Default implementation. + * + * @param [in, out] aes AES GCM object. + */ +#define GHASH_LEN_BLOCK_AARCH64(aes) \ + do { \ + byte scratch[AES_BLOCK_SIZE]; \ + FlattenSzInBits(&scratch[0], aes->aSz); \ + FlattenSzInBits(&scratch[8], aes->cSz); \ + GHASH_ONE_BLOCK_AARCH64(aes, scratch); \ + } \ + while (0) + +/* Update the GHASH with AAD and/or cipher text. + * + * @param [in,out] aes AES GCM object. + * @param [in] a Additional authentication data buffer. + * @param [in] aSz Size of data in AAD buffer. + * @param [in] c Cipher text buffer. + * @param [in] cSz Size of data in cipher text buffer. + */ +void GHASH_UPDATE_AARCH64(Aes* aes, const byte* a, word32 aSz, const byte* c, + word32 cSz) +{ + word32 blocks; + word32 partial; + + /* Hash in A, the Additional Authentication Data */ + if (aSz != 0 && a != NULL) { + /* Update count of AAD we have hashed. */ + aes->aSz += aSz; + /* Check if we have unprocessed data. */ + if (aes->aOver > 0) { + /* Calculate amount we can use - fill up the block. */ + byte sz = AES_BLOCK_SIZE - aes->aOver; + if (sz > aSz) { + sz = aSz; + } + /* Copy extra into last GHASH block array and update count. */ + XMEMCPY(AES_LASTGBLOCK(aes) + aes->aOver, a, sz); + aes->aOver += sz; + if (aes->aOver == AES_BLOCK_SIZE) { + /* We have filled up the block and can process. */ + GHASH_ONE_BLOCK_AARCH64(aes, AES_LASTGBLOCK(aes)); + /* Reset count. */ + aes->aOver = 0; + } + /* Used up some data. */ + aSz -= sz; + a += sz; + } + + /* Calculate number of blocks of AAD and the leftover. */ + blocks = aSz / AES_BLOCK_SIZE; + partial = aSz % AES_BLOCK_SIZE; + /* GHASH full blocks now. 
*/ + while (blocks--) { + GHASH_ONE_BLOCK_AARCH64(aes, a); + a += AES_BLOCK_SIZE; + } + if (partial != 0) { + /* Cache the partial block. */ + XMEMCPY(AES_LASTGBLOCK(aes), a, partial); + aes->aOver = (byte)partial; + } + } + if (aes->aOver > 0 && cSz > 0 && c != NULL) { + /* No more AAD coming and we have a partial block. */ + /* Fill the rest of the block with zeros. */ + byte sz = AES_BLOCK_SIZE - aes->aOver; + XMEMSET(AES_LASTGBLOCK(aes) + aes->aOver, 0, sz); + /* GHASH last AAD block. */ + GHASH_ONE_BLOCK_AARCH64(aes, AES_LASTGBLOCK(aes)); + /* Clear partial count for next time through. */ + aes->aOver = 0; + } + + /* Hash in C, the Ciphertext */ + if (cSz != 0 && c != NULL) { + /* Update count of cipher text we have hashed. */ + aes->cSz += cSz; + if (aes->cOver > 0) { + /* Calculate amount we can use - fill up the block. */ + byte sz = AES_BLOCK_SIZE - aes->cOver; + if (sz > cSz) { + sz = cSz; + } + XMEMCPY(AES_LASTGBLOCK(aes) + aes->cOver, c, sz); + /* Update count of unused encrypted counter. */ + aes->cOver += sz; + if (aes->cOver == AES_BLOCK_SIZE) { + /* We have filled up the block and can process. */ + GHASH_ONE_BLOCK_AARCH64(aes, AES_LASTGBLOCK(aes)); + /* Reset count. */ + aes->cOver = 0; + } + /* Used up some data. */ + cSz -= sz; + c += sz; + } + + /* Calculate number of blocks of cipher text and the leftover. */ + blocks = cSz / AES_BLOCK_SIZE; + partial = cSz % AES_BLOCK_SIZE; + /* GHASH full blocks now. */ + while (blocks--) { + GHASH_ONE_BLOCK_AARCH64(aes, c); + c += AES_BLOCK_SIZE; + } + if (partial != 0) { + /* Cache the partial block. */ + XMEMCPY(AES_LASTGBLOCK(aes), c, partial); + aes->cOver = (byte)partial; + } + } +} + +/* Finalize the GHASH calculation. + * + * Complete hashing cipher text and hash the AAD and cipher text lengths. + * + * @param [in, out] aes AES GCM object. + * @param [out] s Authentication tag. + * @param [in] sSz Size of authentication tag required. + */ +static void GHASH_FINAL_AARCH64(Aes* aes, byte* s, word32 sSz) +{ + /* AAD block incomplete when > 0 */ + byte over = aes->aOver; + + if (aes->cOver > 0) { + /* Cipher text block incomplete. */ + over = aes->cOver; + } + if (over > 0) { + /* Zeroize the unused part of the block. */ + XMEMSET(AES_LASTGBLOCK(aes) + over, 0, AES_BLOCK_SIZE - over); + /* Hash the last block of cipher text. */ + GHASH_ONE_BLOCK_AARCH64(aes, AES_LASTGBLOCK(aes)); + } + /* Hash in the lengths of AAD and cipher text in bits */ + GHASH_LEN_BLOCK_AARCH64(aes); + /* Copy the result into s. */ + XMEMCPY(s, AES_TAG(aes), sSz); +} + +void AES_GCM_init_AARCH64(Aes* aes, const byte* iv, word32 ivSz) +{ + ALIGN32 byte counter[AES_BLOCK_SIZE]; + + if (ivSz == GCM_NONCE_MID_SZ) { + /* Counter is IV with bottom 4 bytes set to: 0x00,0x00,0x00,0x01. */ + XMEMCPY(counter, iv, ivSz); + XMEMSET(counter + GCM_NONCE_MID_SZ, 0, + AES_BLOCK_SIZE - GCM_NONCE_MID_SZ - 1); + counter[AES_BLOCK_SIZE - 1] = 1; + } + else { + /* Counter is GHASH of IV. */ + #ifdef OPENSSL_EXTRA + word32 aadTemp = aes->gcm.aadLen; + aes->gcm.aadLen = 0; + #endif + GHASH_AARCH64(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); + GMULT_AARCH64(counter, aes->gcm.H); + #ifdef OPENSSL_EXTRA + aes->gcm.aadLen = aadTemp; + #endif + } + + /* Copy in the counter for use with cipher. */ + XMEMCPY(AES_COUNTER(aes), counter, AES_BLOCK_SIZE); + /* Encrypt initial counter into a buffer for GCM. 
*/ + AES_encrypt_AARCH64(counter, AES_INITCTR(aes), (byte*)aes->key, + (int)aes->rounds); +} + +void AES_GCM_crypt_update_AARCH64(Aes* aes, byte* out, const byte* in, + word32 sz) +{ + word32 blocks; + word32 partial; + + /* Check if previous encrypted block was not used up. */ + if (aes->over > 0) { + byte pSz = AES_BLOCK_SIZE - aes->over; + if (pSz > sz) pSz = sz; + + /* Use some/all of last encrypted block. */ + xorbufout(out, AES_LASTBLOCK(aes) + aes->over, in, pSz); + aes->over = (aes->over + pSz) & (AES_BLOCK_SIZE - 1); + + /* Some data used. */ + sz -= pSz; + in += pSz; + out += pSz; + } + + /* Calculate the number of blocks needing to be encrypted and any leftover. + */ + blocks = sz / AES_BLOCK_SIZE; + partial = sz & (AES_BLOCK_SIZE - 1); + + /* Encrypt block by block. */ + while (blocks--) { + ALIGN32 byte scratch[AES_BLOCK_SIZE]; + IncrementGcmCounter(AES_COUNTER(aes)); + /* Encrypt counter into a buffer. */ + AES_encrypt_AARCH64(AES_COUNTER(aes), scratch, (byte*)aes->key, + (int)aes->rounds); + /* XOR plain text into encrypted counter into cipher text buffer. */ + xorbufout(out, scratch, in, AES_BLOCK_SIZE); + /* Data complete. */ + in += AES_BLOCK_SIZE; + out += AES_BLOCK_SIZE; + } + + if (partial != 0) { + /* Generate an extra block and use up as much as needed. */ + IncrementGcmCounter(AES_COUNTER(aes)); + /* Encrypt counter into cache. */ + AES_encrypt_AARCH64(AES_COUNTER(aes), AES_LASTBLOCK(aes), + (byte*)aes->key, (int)aes->rounds); + /* XOR plain text into encrypted counter into cipher text buffer. */ + xorbufout(out, AES_LASTBLOCK(aes), in, partial); + /* Keep amount of encrypted block used. */ + aes->over = partial; + } +} + +/* Calculates authentication tag for AES GCM. C implementation. + * + * @param [in, out] aes AES object. + * @param [out] authTag Buffer to store authentication tag in. + * @param [in] authTagSz Length of tag to create. + */ +void AES_GCM_final_AARCH64(Aes* aes, byte* authTag, word32 authTagSz) +{ + /* Calculate authentication tag. */ + GHASH_FINAL_AARCH64(aes, authTag, authTagSz); + /* XOR in as much of encrypted counter as is required. */ + xorbuf(authTag, AES_INITCTR(aes), authTagSz); +#ifdef OPENSSL_EXTRA + /* store AAD size for next call */ + aes->gcm.aadLen = aes->aSz; +#endif + /* Zeroize last block to protect sensitive data. 
*/ + ForceZero(AES_LASTBLOCK(aes), AES_BLOCK_SIZE); } +#endif /* WOLFSSL_AESGCM_STREAM */ #ifdef WOLFSSL_AES_128 /* internal function : see wc_AesGcmEncrypt */ -static int Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, +static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, const byte* authIn, word32 authInSz) { @@ -1924,8 +2070,8 @@ static int Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, counter[AES_BLOCK_SIZE - 1] = 1; } else { - GHASH(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); - GMULT(counter, aes->gcm.H); + GHASH_AARCH64(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); + GMULT_AARCH64(counter, aes->gcm.H); } __asm__ __volatile__ ( @@ -3543,14 +3689,11 @@ static int Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" ); - - - return 0; } #endif /* WOLFSSL_AES_128 */ #ifdef WOLFSSL_AES_192 /* internal function : see wc_AesGcmEncrypt */ -static int Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, +static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, const byte* authIn, word32 authInSz) { @@ -3570,8 +3713,8 @@ static int Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, counter[AES_BLOCK_SIZE - 1] = 1; } else { - GHASH(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); - GMULT(counter, aes->gcm.H); + GHASH_AARCH64(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); + GMULT_AARCH64(counter, aes->gcm.H); } __asm__ __volatile__ ( @@ -5306,14 +5449,11 @@ static int Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" ); - - - return 0; } #endif /* WOLFSSL_AES_192 */ #ifdef WOLFSSL_AES_256 /* internal function : see wc_AesGcmEncrypt */ -static int Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, +static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, const byte* authIn, word32 authInSz) { @@ -5333,8 +5473,8 @@ static int Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, counter[AES_BLOCK_SIZE - 1] = 1; } else { - GHASH(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); - GMULT(counter, aes->gcm.H); + GHASH_AARCH64(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); + GMULT_AARCH64(counter, aes->gcm.H); } __asm__ __volatile__ ( @@ -7200,9 +7340,6 @@ static int Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" ); - - - return 0; } #endif /* WOLFSSL_AES_256 */ @@ -7227,41 +7364,29 @@ static int Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, * by Conrado P.L. 
Gouvea and Julio Lopez reduction on 256bit value using * Algorithm 5 */ -int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, +void AES_GCM_encrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz, const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, const byte* authIn, word32 authInSz) { - /* sanity checks */ - if ((aes == NULL) || (iv == NULL && ivSz > 0) || (authTag == NULL) || - ((authIn == NULL) && (authInSz > 0)) || (ivSz == 0)) { - WOLFSSL_MSG("a NULL parameter passed in when size is larger than 0"); - return BAD_FUNC_ARG; - } - - if ((authTagSz < WOLFSSL_MIN_AUTH_TAG_SZ) || (authTagSz > AES_BLOCK_SIZE)) { - WOLFSSL_MSG("GcmEncrypt authTagSz error"); - return BAD_FUNC_ARG; - } - switch (aes->rounds) { #ifdef WOLFSSL_AES_128 case 10: - return Aes128GcmEncrypt(aes, out, in, sz, iv, ivSz, - authTag, authTagSz, authIn, authInSz); + Aes128GcmEncrypt(aes, out, in, sz, iv, ivSz, authTag, authTagSz, + authIn, authInSz); + break; #endif #ifdef WOLFSSL_AES_192 case 12: - return Aes192GcmEncrypt(aes, out, in, sz, iv, ivSz, - authTag, authTagSz, authIn, authInSz); + Aes192GcmEncrypt(aes, out, in, sz, iv, ivSz, authTag, authTagSz, + authIn, authInSz); + break; #endif #ifdef WOLFSSL_AES_256 case 14: - return Aes256GcmEncrypt(aes, out, in, sz, iv, ivSz, - authTag, authTagSz, authIn, authInSz); + Aes256GcmEncrypt(aes, out, in, sz, iv, ivSz, authTag, authTagSz, + authIn, authInSz); + break; #endif - default: - WOLFSSL_MSG("AES-GCM invalid round number"); - return BAD_FUNC_ARG; } } @@ -7284,8 +7409,8 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, counter[AES_BLOCK_SIZE - 1] = 1; } else { - GHASH(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); - GMULT(counter, aes->gcm.H); + GHASH_AARCH64(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); + GMULT_AARCH64(counter, aes->gcm.H); } __asm__ __volatile__ ( @@ -8935,8 +9060,8 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, counter[AES_BLOCK_SIZE - 1] = 1; } else { - GHASH(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); - GMULT(counter, aes->gcm.H); + GHASH_AARCH64(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); + GMULT_AARCH64(counter, aes->gcm.H); } __asm__ __volatile__ ( @@ -10703,8 +10828,8 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, counter[AES_BLOCK_SIZE - 1] = 1; } else { - GHASH(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); - GMULT(counter, aes->gcm.H); + GHASH_AARCH64(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); + GMULT_AARCH64(counter, aes->gcm.H); } __asm__ __volatile__ ( @@ -12587,38 +12712,30 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, * authIn: additional data buffer * authInSz: size of additional data buffer */ -int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, +int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz, const byte* iv, word32 ivSz, const byte* authTag, word32 authTagSz, const byte* authIn, word32 authInSz) { /* sanity checks */ - if ((aes == NULL) || (iv == NULL) || (authTag == NULL) || - (authTagSz > AES_BLOCK_SIZE) || (authTagSz == 0) || (ivSz == 0) || - ((sz != 0) && ((in == NULL) || (out == NULL)))) { - WOLFSSL_MSG("a NULL parameter passed in when size is larger than 0"); - return BAD_FUNC_ARG; - } - switch (aes->rounds) { #ifdef WOLFSSL_AES_128 case 10: - return Aes128GcmDecrypt(aes, out, in, sz, iv, ivSz, - authTag, authTagSz, authIn, authInSz); + return Aes128GcmDecrypt(aes, 
out, in, sz, iv, ivSz, authTag, + authTagSz, authIn, authInSz); #endif #ifdef WOLFSSL_AES_192 case 12: - return Aes192GcmDecrypt(aes, out, in, sz, iv, ivSz, - authTag, authTagSz, authIn, authInSz); + return Aes192GcmDecrypt(aes, out, in, sz, iv, ivSz, authTag, + authTagSz, authIn, authInSz); #endif #ifdef WOLFSSL_AES_256 case 14: - return Aes256GcmDecrypt(aes, out, in, sz, iv, ivSz, - authTag, authTagSz, authIn, authInSz); + return Aes256GcmDecrypt(aes, out, in, sz, iv, ivSz, authTag, + authTagSz, authIn, authInSz); #endif - default: - WOLFSSL_MSG("AES-GCM invalid round number"); - return BAD_FUNC_ARG; } + + return BAD_FUNC_ARG; } #endif /* HAVE_AES_DECRYPT */ @@ -14179,6 +14296,7 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, while (blocks--) { IncrementGcmCounter(ctr); wc_AesEncrypt(aes, ctr, scratch); +#endif xorbuf(scratch, c, AES_BLOCK_SIZE); XMEMCPY(p, scratch, AES_BLOCK_SIZE); p += AES_BLOCK_SIZE; @@ -14201,10 +14319,9 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, #endif /* HAVE_AES_DECRYPT */ #endif /* HAVE_AESGCM */ -#endif /* aarch64 */ - #ifdef HAVE_AESGCM #ifdef WOLFSSL_AESGCM_STREAM +#ifndef __aarch64__ /* Access initialization counter data. */ #define AES_INITCTR(aes) ((aes)->streamData + 0 * AES_BLOCK_SIZE) /* Access counter data. */ @@ -14422,8 +14539,13 @@ static void AesGcmInit_C(Aes* aes, const byte* iv, word32 ivSz) word32 aadTemp = aes->gcm.aadLen; aes->gcm.aadLen = 0; #endif + #ifdef __aarch64__ + GHASH_AARCH64(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); + GMULT_AARCH64(counter, aes->gcm.H); + #else GHASH(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); GMULT(counter, aes->gcm.H); + #endif #ifdef OPENSSL_EXTRA aes->gcm.aadLen = aadTemp; #endif @@ -14432,7 +14554,8 @@ static void AesGcmInit_C(Aes* aes, const byte* iv, word32 ivSz) /* Copy in the counter for use with cipher. */ XMEMCPY(AES_COUNTER(aes), counter, AES_BLOCK_SIZE); /* Encrypt initial counter into a buffer for GCM. */ - wc_AesEncrypt(aes, counter, AES_INITCTR(aes)); + AES_encrypt_AARCH64(counter, AES_INITCTR(aes), (byte*)aes->key, + aes->rounds); /* Reset state fields. */ aes->over = 0; aes->aSz = 0; @@ -14480,7 +14603,8 @@ static void AesGcmCryptUpdate_C(Aes* aes, byte* out, const byte* in, word32 sz) ALIGN32 byte scratch[AES_BLOCK_SIZE]; IncrementGcmCounter(AES_COUNTER(aes)); /* Encrypt counter into a buffer. */ - wc_AesEncrypt(aes, AES_COUNTER(aes), scratch); + AES_encrypt_AARCH64(AES_COUNTER(aes), scratch, (byte*)aes->key, + aes->rounds); /* XOR plain text into encrypted counter into cipher text buffer. */ xorbufout(out, scratch, in, AES_BLOCK_SIZE); /* Data complete. */ @@ -14492,7 +14616,8 @@ static void AesGcmCryptUpdate_C(Aes* aes, byte* out, const byte* in, word32 sz) /* Generate an extra block and use up as much as needed. */ IncrementGcmCounter(AES_COUNTER(aes)); /* Encrypt counter into cache. */ - wc_AesEncrypt(aes, AES_COUNTER(aes), AES_LASTBLOCK(aes)); + AES_encrypt_AARCH64(AES_COUNTER(aes), AES_LASTBLOCK(aes), + (byte*)aes->key, (int)aes->rounds); /* XOR plain text into encrypted counter into cipher text buffer. */ xorbufout(out, AES_LASTBLOCK(aes), in, partial); /* Keep amount of encrypted block used. 
*/ @@ -14836,11 +14961,13 @@ int wc_AesGcmDecryptFinal(Aes* aes, const byte* authTag, word32 authTagSz) return ret; } #endif /* HAVE_AES_DECRYPT || HAVE_AESGCM_DECRYPT */ +#endif /* !__aarch64__ */ #endif /* WOLFSSL_AESGCM_STREAM */ #endif /* HAVE_AESGCM */ #ifdef HAVE_AESCCM +#ifndef __aarch64__ /* Software version of AES-CCM from wolfcrypt/src/aes.c * Gets some speed up from hardware acceleration of wc_AesEncrypt */ @@ -15110,11 +15237,30 @@ int wc_AesCcmDecrypt(Aes* aes, byte* out, const byte* in, word32 inSz, return result; } #endif /* HAVE_AES_DECRYPT */ +#endif /* !__aarch64__ */ #endif /* HAVE_AESCCM */ #ifdef HAVE_AESGCM /* common GCM functions 32 and 64 bit */ +#if defined(__aarch64__) +void AES_GCM_set_key_AARCH64(Aes* aes, byte* iv) +{ + + AES_encrypt_AARCH64(iv, aes->gcm.H, (byte*)aes->key, aes->rounds); + { + word32* pt = (word32*)aes->gcm.H; + __asm__ volatile ( + "LD1 {v0.16b}, [%[h]] \n" + "RBIT v0.16b, v0.16b \n" + "ST1 {v0.16b}, [%[out]] \n" + : [out] "=r" (pt) + : [h] "0" (pt) + : "cc", "memory", "v0" + ); + } +} +#else int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) { int ret; @@ -15132,19 +15278,6 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) #endif wc_AesEncrypt(aes, iv, aes->gcm.H); - #if defined(__aarch64__) - { - word32* pt = (word32*)aes->gcm.H; - __asm__ volatile ( - "LD1 {v0.16b}, [%[h]] \n" - "RBIT v0.16b, v0.16b \n" - "ST1 {v0.16b}, [%[out]] \n" - : [out] "=r" (pt) - : [h] "0" (pt) - : "cc", "memory", "v0" - ); - } - #else { word32* pt = (word32*)aes->gcm.H; __asm__ volatile ( @@ -15157,14 +15290,15 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) : "cc", "memory", "q0" ); } - #endif } return ret; } +#endif #endif /* HAVE_AESGCM */ +#ifndef __aarch64__ /* AES-DIRECT */ #if defined(WOLFSSL_AES_DIRECT) /* Allow direct access to one block encrypt */ @@ -15188,6 +15322,7 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) } #endif /* HAVE_AES_DECRYPT */ #endif /* WOLFSSL_AES_DIRECT */ +#endif /* !__aarch64__ */ #ifdef WOLFSSL_AES_XTS @@ -15371,26 +15506,12 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) * * returns 0 on success */ -int wc_AesXtsEncrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, - const byte* i, word32 iSz) +void AES_XTS_encrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz, + const byte* i) { - int ret = 0; word32 blocks = (sz / AES_BLOCK_SIZE); byte tmp[AES_BLOCK_SIZE]; - if (xaes == NULL || out == NULL || in == NULL) { - return BAD_FUNC_ARG; - } - - if (iSz < AES_BLOCK_SIZE) { - return BAD_FUNC_ARG; - } - - if (blocks == 0) { - WOLFSSL_MSG("Plain text input too small for encryption"); - return BAD_FUNC_ARG; - } - __asm__ __volatile__ ( "MOV x19, 0x87 \n" @@ -15691,8 +15812,6 @@ int wc_AesXtsEncrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" ); - - return ret; } /* Same process as encryption but Aes key is AES_DECRYPTION type. 
@@ -15707,27 +15826,13 @@ int wc_AesXtsEncrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, * * returns 0 on success */ -int wc_AesXtsDecrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, - const byte* i, word32 iSz) +void AES_XTS_decrypt_AARCH64(XtsAes* xaes, byte* out, const byte* in, word32 sz, + const byte* i) { - int ret = 0; word32 blocks = (sz / AES_BLOCK_SIZE); byte tmp[AES_BLOCK_SIZE]; byte stl = (sz % AES_BLOCK_SIZE); - if (xaes == NULL || out == NULL || in == NULL) { - return BAD_FUNC_ARG; - } - - if (iSz < AES_BLOCK_SIZE) { - return BAD_FUNC_ARG; - } - - if (blocks == 0) { - WOLFSSL_MSG("Plain text input too small for encryption"); - return BAD_FUNC_ARG; - } - /* if Stealing then break out of loop one block early to handle special * case */ blocks -= (stl > 0); @@ -16039,8 +16144,6 @@ int wc_AesXtsDecrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" ); - - return ret; } #else @@ -16556,6 +16659,7 @@ extern void GCM_gmult_len(byte* x, /* const */ byte m[32][AES_BLOCK_SIZE], extern void AES_GCM_encrypt(const unsigned char* in, unsigned char* out, unsigned long len, const unsigned char* ks, int nr, unsigned char* ctr); +#ifndef __aarch64__ int wc_AesSetKey(Aes* aes, const byte* userKey, word32 keylen, const byte* iv, int dir) { @@ -17144,9 +17248,22 @@ static WC_INLINE void RIGHTSHIFTX(byte* x) } #if defined(GCM_TABLE) || defined(GCM_TABLE_4BIT) + +#if defined(__aarch64__) && !defined(BIG_ENDIAN_ORDER) +static WC_INLINE void Shift4_M0(byte *r8, byte *z8) +{ + int i; + for (i = 15; i > 0; i--) + r8[i] = (byte)(z8[i-1] << 4) | (byte)(z8[i] >> 4); + r8[0] = (byte)(z8[0] >> 4); +} +#endif + void GenerateM0(Gcm* gcm) { +#if !defined(__aarch64__) || !defined(BIG_ENDIAN_ORDER) int i; +#endif byte (*m)[AES_BLOCK_SIZE] = gcm->M0; /* 0 times -> 0x0 */ @@ -17191,6 +17308,7 @@ void GenerateM0(Gcm* gcm) XMEMCPY(m[0xf], m[0x8], AES_BLOCK_SIZE); xorbuf (m[0xf], m[0x7], AES_BLOCK_SIZE); +#ifndef __aarch64__ for (i = 0; i < 16; i++) { word32* m32 = (word32*)gcm->M0[i]; m32[0] = ByteReverseWord32(m32[0]); @@ -17198,6 +17316,11 @@ void GenerateM0(Gcm* gcm) m32[2] = ByteReverseWord32(m32[2]); m32[3] = ByteReverseWord32(m32[3]); } +#elif !defined(BIG_ENDIAN_ORDER) + for (i = 0; i < 16; i++) { + Shift4_M0(m[16+i], m[i]); + } +#endif } #endif /* GCM_TABLE */ @@ -17235,6 +17358,7 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) return ret; } +#ifndef __aarch64__ static WC_INLINE void IncrementGcmCounter(byte* inOutCtr) { int i; @@ -17245,6 +17369,7 @@ static WC_INLINE void IncrementGcmCounter(byte* inOutCtr) return; } } +#endif static WC_INLINE void FlattenSzInBits(byte* buf, word32 sz) { @@ -17561,6 +17686,7 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, return 0; } #endif /* HAVE_AESGCM */ +#endif /* !__aarch64__ */ #endif /* !WOLFSSL_ARMASM_NO_HW_CRYPTO */ #endif /* !NO_AES && WOLFSSL_ARMASM */ diff --git a/wolfcrypt/src/port/arm/armv8-sha256.c b/wolfcrypt/src/port/arm/armv8-sha256.c index dabe7af9c3..9d5dc25609 100644 --- a/wolfcrypt/src/port/arm/armv8-sha256.c +++ b/wolfcrypt/src/port/arm/armv8-sha256.c @@ -1407,7 +1407,214 @@ static WC_INLINE int Sha256Final(wc_Sha256* sha256, byte* hash) return ret; } -#else /* */ +#elif defined(__aarch64__) + + static const FLASH_QUALIFIER ALIGN32 word32 K[64] = { + 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL, + 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L, + 
0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L, + 0xC19BF174L, 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL, + 0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 0x983E5152L, + 0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 0xC6E00BF3L, 0xD5A79147L, + 0x06CA6351L, 0x14292967L, 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL, + 0x53380D13L, 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L, + 0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 0xD192E819L, + 0xD6990624L, 0xF40E3585L, 0x106AA070L, 0x19A4C116L, 0x1E376C08L, + 0x2748774CL, 0x34B0BCB5L, 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL, + 0x682E6FF3L, 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L, + 0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L + }; + +/* Both versions of Ch and Maj are logically the same, but with the second set + the compilers can recognize them better for optimization */ +#ifdef WOLFSSL_SHA256_BY_SPEC + /* SHA256 math based on specification */ + #define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z)))) + #define Maj(x,y,z) ((((x) | (y)) & (z)) | ((x) & (y))) +#else + /* SHA256 math reworked for easier compiler optimization */ + #define Ch(x,y,z) ((((y) ^ (z)) & (x)) ^ (z)) + #define Maj(x,y,z) ((((x) ^ (y)) & ((y) ^ (z))) ^ (y)) +#endif + #define R(x, n) (((x) & 0xFFFFFFFFU) >> (n)) + + #define S(x, n) rotrFixed(x, n) + #define Sigma0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22)) + #define Sigma1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25)) + #define Gamma0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3)) + #define Gamma1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) + + #define a(i) S[(0-(i)) & 7] + #define b(i) S[(1-(i)) & 7] + #define c(i) S[(2-(i)) & 7] + #define d(i) S[(3-(i)) & 7] + #define e(i) S[(4-(i)) & 7] + #define f(i) S[(5-(i)) & 7] + #define g(i) S[(6-(i)) & 7] + #define h(i) S[(7-(i)) & 7] + + #ifndef XTRANSFORM + #define XTRANSFORM(S, D) Transform_Sha256((S),(D)) + #endif + +#ifndef SHA256_MANY_REGISTERS + #define RND(j) \ + t0 = h(j) + Sigma1(e(j)) + Ch(e(j), f(j), g(j)) + K[i+(j)] + \ + W[i+(j)]; \ + t1 = Sigma0(a(j)) + Maj(a(j), b(j), c(j)); \ + d(j) += t0; \ + h(j) = t0 + t1 + + static void Transform_Sha256(wc_Sha256* sha256, const byte* data) + { + word32 S[8], t0, t1; + int i; + + #ifdef WOLFSSL_SMALL_STACK_CACHE + word32* W = sha256->W; + if (W == NULL) { + W = (word32*)XMALLOC(sizeof(word32) * WC_SHA256_BLOCK_SIZE, NULL, + DYNAMIC_TYPE_DIGEST); + if (W == NULL) + return MEMORY_E; + sha256->W = W; + } + #elif defined(WOLFSSL_SMALL_STACK) + word32* W; + W = (word32*)XMALLOC(sizeof(word32) * WC_SHA256_BLOCK_SIZE, NULL, + DYNAMIC_TYPE_TMP_BUFFER); + if (W == NULL) + return MEMORY_E; + #else + word32 W[WC_SHA256_BLOCK_SIZE]; + #endif + + /* Copy context->state[] to working vars */ + for (i = 0; i < 8; i++) + S[i] = sha256->digest[i]; + + for (i = 0; i < 16; i++) + W[i] = *((const word32*)&data[i*(int)sizeof(word32)]); + + for (i = 16; i < WC_SHA256_BLOCK_SIZE; i++) + W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16]; + + #ifdef USE_SLOW_SHA256 + /* not unrolled - ~2k smaller and ~25% slower */ + for (i = 0; i < WC_SHA256_BLOCK_SIZE; i += 8) { + int j; + for (j = 0; j < 8; j++) { /* braces needed here for macros {} */ + RND(j); + } + } + #else + /* partially loop unrolled */ + for (i = 0; i < WC_SHA256_BLOCK_SIZE; i += 8) { + RND(0); RND(1); RND(2); RND(3); + RND(4); RND(5); RND(6); RND(7); + } + #endif /* USE_SLOW_SHA256 */ + + /* Add the working vars back into digest state[] */ + for (i = 0; i < 8; i++) { + sha256->digest[i] += S[i]; + } + + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SMALL_STACK_CACHE) + ForceZero(W, 
sizeof(word32) * WC_SHA256_BLOCK_SIZE); + XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER); + #endif + } +#else + /* SHA256 version that keeps all data in registers */ + #define SCHED1(j) (W[j] = *((word32*)&data[j*sizeof(word32)])) + #define SCHED(j) ( \ + W[ j & 15] += \ + Gamma1(W[(j-2) & 15])+ \ + W[(j-7) & 15] + \ + Gamma0(W[(j-15) & 15]) \ + ) + + #define RND1(j) \ + t0 = h(j) + Sigma1(e(j)) + Ch(e(j), f(j), g(j)) + K[i+j] + SCHED1(j); \ + t1 = Sigma0(a(j)) + Maj(a(j), b(j), c(j)); \ + d(j) += t0; \ + h(j) = t0 + t1 + #define RNDN(j) \ + t0 = h(j) + Sigma1(e(j)) + Ch(e(j), f(j), g(j)) + K[i+j] + SCHED(j); \ + t1 = Sigma0(a(j)) + Maj(a(j), b(j), c(j)); \ + d(j) += t0; \ + h(j) = t0 + t1 + + static void Transform_Sha256(wc_Sha256* sha256, const byte* data) + { + word32 S[8], t0, t1; + int i; + #ifdef USE_SLOW_SHA256 + int j; + #endif + word32 W[WC_SHA256_BLOCK_SIZE/sizeof(word32)]; + + /* Copy digest to working vars */ + S[0] = sha256->digest[0]; + S[1] = sha256->digest[1]; + S[2] = sha256->digest[2]; + S[3] = sha256->digest[3]; + S[4] = sha256->digest[4]; + S[5] = sha256->digest[5]; + S[6] = sha256->digest[6]; + S[7] = sha256->digest[7]; + + i = 0; + #ifdef USE_SLOW_SHA256 + for (j = 0; j < 16; j++) { + RND1(j); + } + for (i = 16; i < 64; i += 16) { + for (j = 0; j < 16; j++) { + RNDN(j); + } + } + #else + RND1( 0); RND1( 1); RND1( 2); RND1( 3); + RND1( 4); RND1( 5); RND1( 6); RND1( 7); + RND1( 8); RND1( 9); RND1(10); RND1(11); + RND1(12); RND1(13); RND1(14); RND1(15); + /* 64 operations, partially loop unrolled */ + for (i = 16; i < 64; i += 16) { + RNDN( 0); RNDN( 1); RNDN( 2); RNDN( 3); + RNDN( 4); RNDN( 5); RNDN( 6); RNDN( 7); + RNDN( 8); RNDN( 9); RNDN(10); RNDN(11); + RNDN(12); RNDN(13); RNDN(14); RNDN(15); + } + #endif + + /* Add the working vars back into digest */ + sha256->digest[0] += S[0]; + sha256->digest[1] += S[1]; + sha256->digest[2] += S[2]; + sha256->digest[3] += S[3]; + sha256->digest[4] += S[4]; + sha256->digest[5] += S[5]; + sha256->digest[6] += S[6]; + sha256->digest[7] += S[7]; + } +#endif /* SHA256_MANY_REGISTERS */ + +static void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, + word32 len) +{ + while (len > 0) { + byte tmp[WC_SHA256_BLOCK_SIZE]; + ByteReverseWords((word32*)tmp, (const word32*)data, + WC_SHA256_BLOCK_SIZE); + Transform_Sha256(sha256, tmp); + data += WC_SHA256_BLOCK_SIZE; + len -= WC_SHA256_BLOCK_SIZE; + } +} + +#else extern void Transform_Sha256_Len(wc_Sha256* sha256, const byte* data, word32 len); diff --git a/wolfssl/wolfcrypt/aes.h b/wolfssl/wolfcrypt/aes.h index 61a3433ea9..ab2159abf3 100644 --- a/wolfssl/wolfcrypt/aes.h +++ b/wolfssl/wolfcrypt/aes.h @@ -61,7 +61,7 @@ typedef struct Gcm { #endif WOLFSSL_LOCAL void GenerateM0(Gcm* gcm); -#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(WOLFSSL_ARMASM) WOLFSSL_LOCAL void GMULT(byte* X, byte* Y); #endif WOLFSSL_LOCAL void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, @@ -304,6 +304,13 @@ struct Aes { #ifdef WOLFSSL_AESNI byte use_aesni; #endif /* WOLFSSL_AESNI */ +#if defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + byte use_aes_hw_crypto; +#ifdef HAVE_AESGCM + byte use_pmull_hw_crypto; +#endif +#endif /* __aarch64__ && WOLFSSL_ARMASM && !WOLFSSL_ARMASM_NO_HW_CRYPTO */ #ifdef WOLF_CRYPTO_CB int devId; void* devCtx; @@ -832,6 +839,59 @@ WOLFSSL_API int wc_AesEaxFree(AesEax* eax); #endif /* WOLFSSL_AES_EAX */ +#if defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) +/* GHASH one block 
of data. + * + * XOR block into tag and GMULT with H. + * + * @param [in, out] aes AES GCM object. + * @param [in] block Block of AAD or cipher text. + */ +#define GHASH_ONE_BLOCK(aes, block) \ + do { \ + xorbuf(AES_TAG(aes), block, AES_BLOCK_SIZE); \ + GMULT_AARCH64(AES_TAG(aes), aes->gcm.H); \ + } \ + while (0) + +WOLFSSL_LOCAL int AES_set_key_AARCH64(const unsigned char *userKey, + const int keylen, Aes* aes, int dir); +WOLFSSL_LOCAL void AES_encrypt_AARCH64(const byte* inBlock, byte* outBlock, + byte* key, int nr); +WOLFSSL_LOCAL void AES_decrypt_AARCH64(const byte* inBlock, byte* outBlock, + byte* key, int nr); +WOLFSSL_LOCAL void AES_CBC_encrypt_AARCH64(const byte* in, byte* out, word32 sz, + byte* reg, byte* key, int rounds); +WOLFSSL_LOCAL void AES_CBC_decrypt_AARCH64(const byte* in, byte* out, word32 sz, + byte* reg, byte* key, int rounds); +WOLFSSL_LOCAL void AES_CTR_encrypt_AARCH64(Aes* aes, byte* out, const byte* in, + word32 sz); +WOLFSSL_LOCAL void GMULT_AARCH64(byte* X, byte* Y); +#ifdef WOLFSSL_AESGCM_STREAM +WOLFSSL_LOCAL void GHASH_UPDATE_AARCH64(Aes* aes, const byte* a, word32 aSz, + const byte* c, word32 cSz); +WOLFSSL_LOCAL void AES_GCM_init_AARCH64(Aes* aes, const byte* iv, word32 ivSz); +WOLFSSL_LOCAL void AES_GCM_crypt_update_AARCH64(Aes* aes, byte* out, + const byte* in, word32 sz); +WOLFSSL_LOCAL void AES_GCM_final_AARCH64(Aes* aes, byte* authTag, + word32 authTagSz); +#endif +WOLFSSL_LOCAL void AES_GCM_set_key_AARCH64(Aes* aes, byte* iv); +WOLFSSL_LOCAL void AES_GCM_encrypt_AARCH64(Aes* aes, byte* out, const byte* in, + word32 sz, const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz); +WOLFSSL_LOCAL int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, + word32 sz, const byte* iv, word32 ivSz, const byte* authTag, + word32 authTagSz, const byte* authIn, word32 authInSz); + +#ifdef WOLFSSL_AES_XTS +WOLFSSL_LOCAL void AES_XTS_encrypt_AARCH64(XtsAes* xaes, byte* out, + const byte* in, word32 sz, const byte* i); +WOLFSSL_LOCAL void AES_XTS_decrypt_AARCH64(XtsAes* xaes, byte* out, + const byte* in, word32 sz, const byte* i); +#endif /* WOLFSSL_AES_XTS */ +#endif /* __aarch64__ && WOLFSSL_ARMASM && !WOLFSSL_ARMASM_NO_HW_CRYPTO */ #ifdef __cplusplus } /* extern "C" */ diff --git a/wolfssl/wolfcrypt/cpuid.h b/wolfssl/wolfcrypt/cpuid.h index c91b628b5b..b7a5714798 100644 --- a/wolfssl/wolfcrypt/cpuid.h +++ b/wolfssl/wolfcrypt/cpuid.h @@ -38,6 +38,11 @@ #define HAVE_CPUID #define HAVE_CPUID_INTEL #endif +#if (defined(WOLFSSL_AARCH64_BUILD) || (defined(__aarch64__) && \ + defined(WOLFSSL_ARMASM))) && !defined(WOLFSSL_NO_ASM) + #define HAVE_CPUID + #define HAVE_CPUID_AARCH64 +#endif #ifdef HAVE_CPUID_INTEL @@ -63,6 +68,26 @@ #define IS_INTEL_BMI1(f) ((f) & CPUID_BMI1) #define IS_INTEL_SHA(f) ((f) & CPUID_SHA) +#elif defined(HAVE_CPUID_AARCH64) + + #define CPUID_AES 0x0001 + #define CPUID_PMULL 0x0002 + #define CPUID_SHA256 0x0004 + #define CPUID_SHA512 0x0008 + #define CPUID_RDM 0x0010 + #define CPUID_SHA3 0x0020 + #define CPUID_SM3 0x0040 + #define CPUID_SM4 0x0080 + + #define IS_AARCH64_AES(f) ((f) & CPUID_AES) + #define IS_AARCH64_PMULL(f) ((f) & CPUID_PMULL) + #define IS_AARCH64_SHA256(f) ((f) & CPUID_SHA256) + #define IS_AARCH64_SHA512(f) ((f) & CPUID_SHA512) + #define IS_AARCH64_RDM(f) ((f) & CPUID_RDM) + #define IS_AARCH64_SHA3(f) ((f) & CPUID_SHA3) + #define IS_AARCH64_SM3(f) ((f) & CPUID_SM3) + #define IS_AARCH64_SM4(f) ((f) & CPUID_SM4) + #endif #ifdef HAVE_CPUID
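The streaming AES-GCM helpers introduced above (AES_GCM_init_AARCH64, GHASH_UPDATE_AARCH64, AES_GCM_crypt_update_AARCH64 and AES_GCM_final_AARCH64) compose in the same order as the existing software streaming path: initialize the counter from the IV, hash any AAD, counter-encrypt the plaintext and hash the resulting ciphertext, then fold in the bit lengths and XOR with the encrypted initial counter. The sketch below shows that composition for a one-shot encryption. It is illustrative only: gcm_stream_encrypt_sketch is not part of the patch, the key is assumed to have been loaded with wc_AesGcmSetKey, and the stream state (aes->streamData, aes->aSz, aes->cSz, aes->over, aes->aOver, aes->cOver) is assumed to have been cleared by the generic wc_AesGcmInit wrapper before these helpers run.

#include <wolfssl/wolfcrypt/settings.h>
/* Built inside wolfCrypt so the WOLFSSL_LOCAL prototypes are visible. */
#include <wolfssl/wolfcrypt/aes.h>

#if defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \
    !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) && defined(WOLFSSL_AESGCM_STREAM)
/* Hypothetical one-shot driver for the AArch64 streaming helpers. */
static void gcm_stream_encrypt_sketch(Aes* aes, const byte* iv, word32 ivSz,
    const byte* aad, word32 aadSz, const byte* plain, byte* cipher, word32 sz,
    byte* tag, word32 tagSz)
{
    /* Derive the initial counter from the IV and encrypt it for the tag. */
    AES_GCM_init_AARCH64(aes, iv, ivSz);

    /* Hash the AAD; partial blocks are cached until a full block arrives. */
    GHASH_UPDATE_AARCH64(aes, aad, aadSz, NULL, 0);

    /* Counter-mode encrypt, then hash the cipher text that was produced. */
    AES_GCM_crypt_update_AARCH64(aes, cipher, plain, sz);
    GHASH_UPDATE_AARCH64(aes, NULL, 0, cipher, sz);

    /* Hash the AAD/cipher text lengths and XOR with the encrypted initial
     * counter to produce the authentication tag (tagSz <= AES_BLOCK_SIZE). */
    AES_GCM_final_AARCH64(aes, tag, tagSz);
}
#endif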
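With the argument checking removed from the AArch64 XTS workers, AES_XTS_encrypt_AARCH64 and AES_XTS_decrypt_AARCH64 now rely on their caller to validate the pointers, the tweak size and the minimum data length, exactly as the deleted checks did. A minimal sketch of that expected call pattern follows; xts_encrypt_checked is hypothetical and simply restates the checks that the generic wc_AesXtsEncrypt entry point is assumed to perform before dispatching here.

#if defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \
    !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) && defined(WOLFSSL_AES_XTS)
static int xts_encrypt_checked(XtsAes* xaes, byte* out, const byte* in,
    word32 sz, const byte* i, word32 iSz)
{
    if (xaes == NULL || out == NULL || in == NULL)
        return BAD_FUNC_ARG;
    if (iSz < AES_BLOCK_SIZE)
        return BAD_FUNC_ARG;
    if (sz < AES_BLOCK_SIZE) {
        /* At least one full block is required (blocks == 0 in the worker). */
        return BAD_FUNC_ARG;
    }

    AES_XTS_encrypt_AARCH64(xaes, out, in, sz, i);
    return 0;
}
#endif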
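For little-endian AArch64 builds that fall back to the table-driven GHASH (GCM_TABLE_4BIT without the crypto extensions), GenerateM0 now fills a second group of sixteen entries with the first group shifted right by four bits via Shift4_M0, presumably so the 32-entry M0 table consumed by the GCM_gmult_len assembly can serve both nibbles of a byte with plain loads. The small standalone program below just repeats the helper's arithmetic on a recognizable constant to show what the shift produces; it is an illustration, not part of the patch.

#include <stdio.h>

/* Same arithmetic as the Shift4_M0 helper: shift a 16-byte value right by
 * four bits, treating byte 0 as the most significant byte. */
static void shift4(unsigned char r[16], const unsigned char z[16])
{
    int i;
    for (i = 15; i > 0; i--)
        r[i] = (unsigned char)((z[i - 1] << 4) | (z[i] >> 4));
    r[0] = (unsigned char)(z[0] >> 4);
}

int main(void)
{
    unsigned char z[16] = { 0xAB, 0xCD };  /* 0xABCD00...00 */
    unsigned char r[16];
    int i;

    shift4(r, z);
    for (i = 0; i < 16; i++)
        printf("%02X", r[i]);              /* prints 0ABCD000...00 */
    printf("\n");
    return 0;
}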