From 97c0f4c9295c45f50fdada45a05aac633bc55cb9 Mon Sep 17 00:00:00 2001
From: Tino Reichardt
Date: Fri, 5 Aug 2022 14:59:19 +0200
Subject: [PATCH] Add new SHA2 files

The C files are generic public domain files which are mostly faster
than the BSD ones - this depends on the compiler and its flags, of
course ;)

The assembly files are taken from the current OpenSSL master, and are
licensed under the Apache License Version 2.0:
- sha256-x86_64.S: SSSE3, AVX, AVX2, SHA-NI
- sha512-x86_64.S: AVX, AVX2
- sha256-armv8.S: NEON, ARMv8 Cryptography Extension
- sha512-armv8.S: ARMv8 Cryptography Extension
- sha256-ppc.S: generic PPC64 LE/BE
- sha512-ppc.S: generic PPC64 LE/BE
- sha256-p8.S: Power ISA Version 2.07 LE/BE
- sha512-p8.S: Power ISA Version 2.07 LE/BE

Signed-off-by: Tino Reichardt
---
 cmd/ztest.c | 6 +-
 include/Makefile.am | 2 +
 include/os/freebsd/spl/sys/mod_os.h | 9 +
 include/sys/blake3.h | 31 +-
 include/sys/sha2.h | 107 +
 include/sys/zfs_impl.h | 69 +
 include/sys/zio_checksum.h | 6 +-
 lib/libicp/Makefile.am | 23 +-
 lib/libspl/include/Makefile.am | 1 -
 lib/libzfs/Makefile.am | 3 -
 lib/libzpool/Makefile.am | 2 +-
 module/Kbuild.in | 23 +-
 module/Makefile.bsd | 29 +-
 module/icp/algs/blake3/blake3.c | 16 +-
 module/icp/algs/blake3/blake3_generic.c | 7 +-
 module/icp/algs/blake3/blake3_impl.c | 483 ++-
 module/icp/algs/blake3/blake3_impl.h | 34 +-
 module/icp/algs/blake3/blake3_x86-64.c | 248 --
 module/icp/algs/sha2/sha256_impl.c | 271 ++
 module/icp/algs/sha2/sha2_generic.c | 564 +++
 module/icp/algs/sha2/sha2_impl.h | 61 +
 module/icp/algs/sha2/sha512_impl.c | 223 ++
 module/icp/asm-aarch64/sha2/sha256-armv8.S | 1020 ++++++
 module/icp/asm-aarch64/sha2/sha512-armv8.S | 580 +++
 module/icp/asm-ppc64/sha2/sha256-p8.S | 1498 ++++++++
 module/icp/asm-ppc64/sha2/sha256-ppc.S | 2705 ++++++++++++++
 module/icp/asm-ppc64/sha2/sha512-p8.S | 1699 +++++++++
 module/icp/asm-ppc64/sha2/sha512-ppc.S | 2945 +++++++++++++++
 module/icp/asm-x86_64/sha2/sha256-x86_64.S | 3742 ++++++++++++++++++++
 module/icp/asm-x86_64/sha2/sha512-x86_64.S | 2650 ++++++++++++++
 module/icp/illumos-crypto.c | 2 -
 module/icp/include/generic_impl.c | 240 ++
 module/icp/io/sha2_mod.c | 1290 -------
 module/os/freebsd/zfs/zio_crypt.c | 4 +-
 module/os/linux/zfs/zio_crypt.c | 4 +-
 module/zfs/sha2_zfs.c | 30 +-
 module/zfs/zfs_chksum.c | 87 +-
 module/zfs/zfs_impl.c | 60 +
 module/zfs/zio_checksum.c | 8 +-
 tests/zfs-tests/cmd/checksum/blake3_test.c | 18 +-
 tests/zfs-tests/cmd/checksum/sha2_test.c | 62 +-
 41 files changed, 18963 insertions(+), 1899 deletions(-)
 create mode 100644 include/sys/sha2.h
 create mode 100644 include/sys/zfs_impl.h
 delete mode 100644 module/icp/algs/blake3/blake3_x86-64.c
 create mode 100644 module/icp/algs/sha2/sha256_impl.c
 create mode 100644 module/icp/algs/sha2/sha2_generic.c
 create mode 100644 module/icp/algs/sha2/sha2_impl.h
 create mode 100644 module/icp/algs/sha2/sha512_impl.c
 create mode 100644 module/icp/asm-aarch64/sha2/sha256-armv8.S
 create mode 100644 module/icp/asm-aarch64/sha2/sha512-armv8.S
 create mode 100644 module/icp/asm-ppc64/sha2/sha256-p8.S
 create mode 100644 module/icp/asm-ppc64/sha2/sha256-ppc.S
 create mode 100644 module/icp/asm-ppc64/sha2/sha512-p8.S
 create mode 100644 module/icp/asm-ppc64/sha2/sha512-ppc.S
 create mode 100644 module/icp/asm-x86_64/sha2/sha256-x86_64.S
 create mode 100644 module/icp/asm-x86_64/sha2/sha512-x86_64.S
 create mode 100644 module/icp/include/generic_impl.c
 delete mode 100644 module/icp/io/sha2_mod.c
 create mode 100644 module/zfs/zfs_impl.c

diff --git a/cmd/ztest.c b/cmd/ztest.c
index
31b9990a1fcf..6e83dcf906a4 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -135,6 +135,7 @@ #include #include #include +#include #if (__GLIBC__ && !__UCLIBC__) #include /* for backtrace() */ #endif @@ -6389,6 +6390,7 @@ ztest_blake3(ztest_ds_t *zd, uint64_t id) int i, *ptr; uint32_t size; BLAKE3_CTX ctx; + const zfs_impl_t *blake3 = zfs_impl_get_ops("blake3"); size = ztest_random_blocksize(); buf = umem_alloc(size, UMEM_NOFAIL); @@ -6413,7 +6415,7 @@ ztest_blake3(ztest_ds_t *zd, uint64_t id) void *res2 = &zc_res2; /* BLAKE3_KEY_LEN = 32 */ - VERIFY0(blake3_set_impl_name("generic")); + VERIFY0(blake3->setname("generic")); templ = abd_checksum_blake3_tmpl_init(&salt); Blake3_InitKeyed(&ctx, salt_ptr); Blake3_Update(&ctx, buf, size); @@ -6422,7 +6424,7 @@ ztest_blake3(ztest_ds_t *zd, uint64_t id) ZIO_CHECKSUM_BSWAP(&zc_ref2); abd_checksum_blake3_tmpl_free(templ); - VERIFY0(blake3_set_impl_name("cycle")); + VERIFY0(blake3->setname("cycle")); while (run_count-- > 0) { /* Test current implementation */ diff --git a/include/Makefile.am b/include/Makefile.am index 1a7f67e9c440..79bdc3b3aee2 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -74,6 +74,7 @@ COMMON_H = \ sys/rrwlock.h \ sys/sa.h \ sys/sa_impl.h \ + sys/sha2.h \ sys/skein.h \ sys/spa.h \ sys/spa_boot.h \ @@ -124,6 +125,7 @@ COMMON_H = \ sys/zfs_delay.h \ sys/zfs_file.h \ sys/zfs_fuid.h \ + sys/zfs_impl.h \ sys/zfs_project.h \ sys/zfs_quota.h \ sys/zfs_racct.h \ diff --git a/include/os/freebsd/spl/sys/mod_os.h b/include/os/freebsd/spl/sys/mod_os.h index 3a9ebbfc3bc4..67a83c95d778 100644 --- a/include/os/freebsd/spl/sys/mod_os.h +++ b/include/os/freebsd/spl/sys/mod_os.h @@ -92,6 +92,15 @@ #define fletcher_4_param_set_args(var) \ CTLTYPE_STRING, NULL, 0, fletcher_4_param, "A" +#define blake3_param_set_args(var) \ + CTLTYPE_STRING, NULL, 0, blake3_param, "A" + +#define sha256_param_set_args(var) \ + CTLTYPE_STRING, NULL, 0, sha256_param, "A" + +#define sha512_param_set_args(var) \ + CTLTYPE_STRING, NULL, 0, sha512_param, "A" + #include #define module_init(fn) \ static void \ diff --git a/include/sys/blake3.h b/include/sys/blake3.h index 19500585f382..b981b18db943 100644 --- a/include/sys/blake3.h +++ b/include/sys/blake3.h @@ -22,11 +22,11 @@ /* * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor - * Copyright (c) 2021 Tino Reichardt + * Copyright (c) 2021-2022 Tino Reichardt */ -#ifndef BLAKE3_H -#define BLAKE3_H +#ifndef _SYS_BLAKE3_H +#define _SYS_BLAKE3_H #ifdef _KERNEL #include @@ -72,7 +72,7 @@ typedef struct { */ uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; - /* const blake3_impl_ops_t *ops */ + /* const blake3_ops_t *ops */ const void *ops; } BLAKE3_CTX; @@ -97,29 +97,8 @@ extern void **blake3_per_cpu_ctx; extern void blake3_per_cpu_ctx_init(void); extern void blake3_per_cpu_ctx_fini(void); -/* return number of supported implementations */ -extern int blake3_get_impl_count(void); - -/* return id of selected implementation */ -extern int blake3_get_impl_id(void); - -/* return name of selected implementation */ -extern const char *blake3_get_impl_name(void); - -/* setup id as fastest implementation */ -extern void blake3_set_impl_fastest(uint32_t id); - -/* set implementation by id */ -extern void blake3_set_impl_id(uint32_t id); - -/* set implementation by name */ -extern int blake3_set_impl_name(const char *name); - -/* set startup implementation */ -extern void blake3_setup_impl(void); - #ifdef __cplusplus } #endif -#endif /* 
BLAKE3_H */
+#endif /* _SYS_BLAKE3_H */
diff --git a/include/sys/sha2.h b/include/sys/sha2.h
new file mode 100644
index 000000000000..895b593ac817
--- /dev/null
+++ b/include/sys/sha2.h
@@ -0,0 +1,107 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2022 Tino Reichardt
+ */
+
+#ifndef _SYS_SHA2_H
+#define _SYS_SHA2_H
+
+#ifdef _KERNEL
+#include
+#else
+#include
+#include
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SHA224_BLOCK_LENGTH 64
+#define SHA256_BLOCK_LENGTH 64
+#define SHA384_BLOCK_LENGTH 128
+#define SHA512_BLOCK_LENGTH 128
+
+#define SHA224_DIGEST_LENGTH 28
+#define SHA256_DIGEST_LENGTH 32
+#define SHA384_DIGEST_LENGTH 48
+#define SHA512_DIGEST_LENGTH 64
+
+#define SHA512_224_DIGEST_LENGTH 28
+#define SHA512_256_DIGEST_LENGTH 32
+
+/* sha256 context */
+typedef struct {
+	uint32_t state[8];
+	uint64_t count[2];
+	uint8_t wbuf[64];
+
+	/* const sha256_ops_t *ops */
+	const void *ops;
+} sha256_ctx;
+
+/* sha512 context */
+typedef struct {
+	uint64_t state[8];
+	uint64_t count[2];
+	uint8_t wbuf[128];
+
+	/* const sha512_ops_t *ops */
+	const void *ops;
+} sha512_ctx;
+
+/* SHA2 context */
+typedef struct {
+	union {
+		sha256_ctx sha256;
+		sha512_ctx sha512;
+	};
+
+	/* algorithm type */
+	int algotype;
+} SHA2_CTX;
+
+/* SHA2 algorithm types */
+enum sha2_algotype {
+	SHA224 = 0,
+	SHA256,
+	SHA384,
+	SHA512,
+	SHA512_224,
+	SHA512_256
+};
+
+/* SHA2 Init function */
+extern void SHA2Init(SHA2_CTX *ctx, int algotype);
+
+/* SHA2 Update function */
+extern void SHA2Update(SHA2_CTX *ctx, const void *data, size_t len);
+
+/* SHA2 Final function */
+extern void SHA2Final(SHA2_CTX *ctx, void *digest);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SHA2_H */
diff --git a/include/sys/zfs_impl.h b/include/sys/zfs_impl.h
new file mode 100644
index 000000000000..df4899f132b8
--- /dev/null
+++ b/include/sys/zfs_impl.h
@@ -0,0 +1,69 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2022 Tino Reichardt
+ */
+
+#ifndef _SYS_ZFS_IMPL_H
+#define _SYS_ZFS_IMPL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* generic implementation backends */
+typedef struct
+{
+	/* algorithm name */
+	const char *name;
+
+	/* get number of supported implementations */
+	uint32_t (*getcnt)(void);
+
+	/* get id of selected implementation */
+	uint32_t (*getid)(void);
+
+	/* get name of selected implementation */
+	const char *(*getname)(void);
+
+	/* setup id as fastest implementation */
+	void (*set_fastest)(uint32_t id);
+
+	/* set implementation by id */
+	void (*setid)(uint32_t id);
+
+	/* set implementation by name */
+	int (*setname)(const char *val);
+} zfs_impl_t;
+
+/* return the function pointer set of the given algorithm */
+extern const zfs_impl_t *zfs_impl_get_ops(const char *algo);
+
+extern const zfs_impl_t zfs_blake3_ops;
+extern const zfs_impl_t zfs_sha256_ops;
+extern const zfs_impl_t zfs_sha512_ops;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_IMPL_H */
diff --git a/include/sys/zio_checksum.h b/include/sys/zio_checksum.h
index 84e371d6c3ef..c54a6601c00d 100644
--- a/include/sys/zio_checksum.h
+++ b/include/sys/zio_checksum.h
@@ -110,9 +110,9 @@ _SYS_ZIO_CHECKSUM_H const zio_checksum_info_t
 */
 /* SHA2 */
-extern zio_checksum_t abd_checksum_SHA256;
-extern zio_checksum_t abd_checksum_SHA512_native;
-extern zio_checksum_t abd_checksum_SHA512_byteswap;
+extern zio_checksum_t abd_checksum_sha256;
+extern zio_checksum_t abd_checksum_sha512_native;
+extern zio_checksum_t abd_checksum_sha512_byteswap;
 /* Skein */
 extern zio_checksum_t abd_checksum_skein_native;
diff --git a/lib/libicp/Makefile.am b/lib/libicp/Makefile.am
index b7f1d0e1b1e4..8fa9407ec533 100644
--- a/lib/libicp/Makefile.am
+++ b/lib/libicp/Makefile.am
@@ -16,7 +16,6 @@ nodist_libicp_la_SOURCES = \
 	module/icp/algs/blake3/blake3.c \
 	module/icp/algs/blake3/blake3_generic.c \
 	module/icp/algs/blake3/blake3_impl.c \
-	module/icp/algs/blake3/blake3_x86-64.c \
 	module/icp/algs/edonr/edonr.c \
 	module/icp/algs/modes/modes.c \
 	module/icp/algs/modes/cbc.c \
@@ -26,30 +25,38 @@ nodist_libicp_la_SOURCES = \
 	module/icp/algs/modes/ctr.c \
 	module/icp/algs/modes/ccm.c \
 	module/icp/algs/modes/ecb.c \
-	module/icp/algs/sha2/sha2.c \
+	module/icp/algs/sha2/sha2_generic.c \
+	module/icp/algs/sha2/sha256_impl.c \
+	module/icp/algs/sha2/sha512_impl.c \
 	module/icp/algs/skein/skein.c \
 	module/icp/algs/skein/skein_block.c \
 	module/icp/algs/skein/skein_iv.c \
 	module/icp/illumos-crypto.c \
 	module/icp/io/aes.c \
-	module/icp/io/sha2_mod.c \
 	module/icp/io/skein_mod.c \
 	module/icp/core/kcf_sched.c \
 	module/icp/core/kcf_prov_lib.c \
 	module/icp/core/kcf_callprov.c \
 	module/icp/core/kcf_mech_tabs.c \
-	module/icp/core/kcf_prov_tabs.c
+	module/icp/core/kcf_prov_tabs.c \
+	module/zfs/zfs_impl.c
 
 if TARGET_CPU_AARCH64
 nodist_libicp_la_SOURCES += \
 	module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S \
-	module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S
+	module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S \
+	module/icp/asm-aarch64/sha2/sha256-armv8.S \
+	module/icp/asm-aarch64/sha2/sha512-armv8.S
 endif
 
 if TARGET_CPU_POWERPC
 nodist_libicp_la_SOURCES += \
 	module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S \
-	module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S
+	module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S \
+
module/icp/asm-ppc64/sha2/sha256-ppc.S \ + module/icp/asm-ppc64/sha2/sha512-ppc.S \ + module/icp/asm-ppc64/sha2/sha256-p8.S \ + module/icp/asm-ppc64/sha2/sha512-p8.S endif if TARGET_CPU_X86_64 @@ -60,8 +67,8 @@ nodist_libicp_la_SOURCES += \ module/icp/asm-x86_64/modes/gcm_pclmulqdq.S \ module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S \ module/icp/asm-x86_64/modes/ghash-x86_64.S \ - module/icp/asm-x86_64/sha2/sha256_impl.S \ - module/icp/asm-x86_64/sha2/sha512_impl.S \ + module/icp/asm-x86_64/sha2/sha256-x86_64.S \ + module/icp/asm-x86_64/sha2/sha512-x86_64.S \ module/icp/asm-x86_64/blake3/blake3_avx2.S \ module/icp/asm-x86_64/blake3/blake3_avx512.S \ module/icp/asm-x86_64/blake3/blake3_sse2.S \ diff --git a/lib/libspl/include/Makefile.am b/lib/libspl/include/Makefile.am index 6f0e1818d22e..6ff254000890 100644 --- a/lib/libspl/include/Makefile.am +++ b/lib/libspl/include/Makefile.am @@ -45,7 +45,6 @@ libspl_sys_HEADERS = \ %D%/sys/poll.h \ %D%/sys/priv.h \ %D%/sys/processor.h \ - %D%/sys/sha2.h \ %D%/sys/simd.h \ %D%/sys/stack.h \ %D%/sys/stdtypes.h \ diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am index f5eb84679204..cffe341220c2 100644 --- a/lib/libzfs/Makefile.am +++ b/lib/libzfs/Makefile.am @@ -34,8 +34,6 @@ dist_libzfs_la_SOURCES += \ endif nodist_libzfs_la_SOURCES = \ - module/icp/algs/sha2/sha2.c \ - \ module/zcommon/cityhash.c \ module/zcommon/zfeature_common.c \ module/zcommon/zfs_comutil.c \ @@ -52,7 +50,6 @@ nodist_libzfs_la_SOURCES = \ module/zcommon/zpool_prop.c \ module/zcommon/zprop_common.c - libzfs_la_LIBADD = \ libshare.la \ libzfs_core.la \ diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index eaa920e56106..47543d7ff0e4 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -118,7 +118,7 @@ nodist_libzpool_la_SOURCES = \ module/zfs/refcount.c \ module/zfs/rrwlock.c \ module/zfs/sa.c \ - module/zfs/sha256.c \ + module/zfs/sha2_zfs.c \ module/zfs/skein_zfs.c \ module/zfs/spa.c \ module/zfs/spa_boot.c \ diff --git a/module/Kbuild.in b/module/Kbuild.in index 2450da1918b2..43992fa2bea8 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -79,7 +79,6 @@ ICP_OBJS := \ algs/blake3/blake3.o \ algs/blake3/blake3_generic.o \ algs/blake3/blake3_impl.o \ - algs/blake3/blake3_x86-64.o \ algs/edonr/edonr.o \ algs/modes/cbc.o \ algs/modes/ccm.o \ @@ -88,6 +87,9 @@ ICP_OBJS := \ algs/modes/gcm.o \ algs/modes/gcm_generic.o \ algs/modes/modes.o \ + algs/sha2/sha2_generic.o \ + algs/sha2/sha256_impl.o \ + algs/sha2/sha512_impl.o \ algs/skein/skein.o \ algs/skein/skein_block.o \ algs/skein/skein_iv.o \ @@ -101,7 +103,6 @@ ICP_OBJS := \ core/kcf_sched.o \ illumos-crypto.o \ io/aes.o \ - io/sha2_mod.o \ io/skein_mod.o \ spi/kcf_spi.o @@ -113,25 +114,30 @@ ICP_OBJS_X86_64 := \ asm-x86_64/blake3/blake3_avx512.o \ asm-x86_64/blake3/blake3_sse2.o \ asm-x86_64/blake3/blake3_sse41.o \ + asm-x86_64/sha2/sha256-x86_64.o \ + asm-x86_64/sha2/sha512-x86_64.o \ asm-x86_64/modes/aesni-gcm-x86_64.o \ asm-x86_64/modes/gcm_pclmulqdq.o \ asm-x86_64/modes/ghash-x86_64.o - ICP_OBJS_X86 := \ algs/aes/aes_impl_aesni.o \ algs/aes/aes_impl_x86-64.o \ algs/modes/gcm_pclmulqdq.o - ICP_OBJS_ARM64 := \ asm-aarch64/blake3/b3_aarch64_sse2.o \ - asm-aarch64/blake3/b3_aarch64_sse41.o - + asm-aarch64/blake3/b3_aarch64_sse41.o \ + asm-aarch64/sha2/sha256-armv8.o \ + asm-aarch64/sha2/sha512-armv8.o ICP_OBJS_PPC_PPC64 := \ asm-ppc64/blake3/b3_ppc64le_sse2.o \ - asm-ppc64/blake3/b3_ppc64le_sse41.o + asm-ppc64/blake3/b3_ppc64le_sse41.o \ + asm-ppc64/sha2/sha256-p8.o \ + 
asm-ppc64/sha2/sha512-p8.o \ + asm-ppc64/sha2/sha256-ppc.o \ + asm-ppc64/sha2/sha512-ppc.o zfs-objs += $(addprefix icp/,$(ICP_OBJS)) zfs-$(CONFIG_X86) += $(addprefix icp/,$(ICP_OBJS_X86)) @@ -153,6 +159,8 @@ $(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64) \ # utility tries to interpret them as opcodes and obviously fails doing so. OBJECT_FILES_NON_STANDARD_aesni-gcm-x86_64.o := y OBJECT_FILES_NON_STANDARD_ghash-x86_64.o := y +OBJECT_FILES_NON_STANDARD_sha256-x86_64.o := y +OBJECT_FILES_NON_STANDARD_sha512-x86_64.o := y LUA_OBJS := \ lapi.o \ @@ -381,6 +389,7 @@ ZFS_OBJS := \ zfs_chksum.o \ zfs_fm.o \ zfs_fuid.o \ + zfs_impl.o \ zfs_ioctl.o \ zfs_log.o \ zfs_onexit.o \ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 56742c8ad49f..190a5a4094a0 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -14,6 +14,10 @@ KMOD= openzfs ${SRCDIR}/icp/asm-aarch64/blake3 \ ${SRCDIR}/icp/asm-ppc64/blake3 \ ${SRCDIR}/icp/asm-x86_64/blake3 \ + ${SRCDIR}/icp/algs/sha2 \ + ${SRCDIR}/icp/asm-aarch64/sha2 \ + ${SRCDIR}/icp/asm-ppc64/sha2 \ + ${SRCDIR}/icp/asm-x86_64/sha2 \ ${SRCDIR}/lua \ ${SRCDIR}/nvpair \ ${SRCDIR}/icp/algs/edonr \ @@ -27,8 +31,6 @@ KMOD= openzfs ${SRCDIR}/zstd/lib/compress \ ${SRCDIR}/zstd/lib/decompress - - CFLAGS+= -I${.OBJDIR:H}/include CFLAGS+= -I${INCDIR} CFLAGS+= -I${INCDIR}/os/freebsd @@ -88,8 +90,7 @@ SRCS+= edonr.c #icp/algs/blake3 SRCS+= blake3.c \ blake3_generic.c \ - blake3_impl.c \ - blake3_x86-64.c + blake3_impl.c #icp/asm-aarch64/blake3 SRCS+= b3_aarch64_sse2.S \ @@ -105,6 +106,25 @@ SRCS+= blake3_avx2.S \ blake3_sse2.S \ blake3_sse41.S +#icp/algs/sha2 +SRCS+= sha2_generic.c \ + sha256_impl.c \ + sha512_impl.c + +#icp/asm-aarch64/sha2 +SRCS+= sha256-armv8.S \ + sha512-armv8.S + +#icp/asm-ppc64/sha2 +SRCS+= sha256-p8.S \ + sha512-p8.S \ + sha256-ppc.S \ + sha512-ppc.S + +#icp/asm-x86_64/sha2 +SRCS+= sha256-x86_64.S \ + sha512-x86_64.S + #lua SRCS+= lapi.c \ lauxlib.c \ @@ -320,6 +340,7 @@ SRCS+= abd.c \ zfs_file_os.c \ zfs_fm.c \ zfs_fuid.c \ + zfs_impl.c \ zfs_ioctl.c \ zfs_log.c \ zfs_onexit.c \ diff --git a/module/icp/algs/blake3/blake3.c b/module/icp/algs/blake3/blake3.c index b9600207b673..3a446ceeb7a9 100644 --- a/module/icp/algs/blake3/blake3.c +++ b/module/icp/algs/blake3/blake3.c @@ -129,7 +129,7 @@ static output_t make_output(const uint32_t input_cv[8], * bytes. For that reason, chaining values in the CV stack are represented as * bytes. */ -static void output_chaining_value(const blake3_impl_ops_t *ops, +static void output_chaining_value(const blake3_ops_t *ops, const output_t *ctx, uint8_t cv[32]) { uint32_t cv_words[8]; @@ -139,7 +139,7 @@ static void output_chaining_value(const blake3_impl_ops_t *ops, store_cv_words(cv, cv_words); } -static void output_root_bytes(const blake3_impl_ops_t *ops, const output_t *ctx, +static void output_root_bytes(const blake3_ops_t *ops, const output_t *ctx, uint64_t seek, uint8_t *out, size_t out_len) { uint64_t output_block_counter = seek / 64; @@ -163,7 +163,7 @@ static void output_root_bytes(const blake3_impl_ops_t *ops, const output_t *ctx, } } -static void chunk_state_update(const blake3_impl_ops_t *ops, +static void chunk_state_update(const blake3_ops_t *ops, blake3_chunk_state_t *ctx, const uint8_t *input, size_t input_len) { if (ctx->buf_len > 0) { @@ -230,7 +230,7 @@ static size_t left_len(size_t content_len) * number of chunks hashed. These chunks are never the root and never empty; * those cases use a different codepath. 
*/ -static size_t compress_chunks_parallel(const blake3_impl_ops_t *ops, +static size_t compress_chunks_parallel(const blake3_ops_t *ops, const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t *out) { @@ -274,7 +274,7 @@ static size_t compress_chunks_parallel(const blake3_impl_ops_t *ops, * return it as an additional output.) These parents are never the root and * never empty; those cases use a different codepath. */ -static size_t compress_parents_parallel(const blake3_impl_ops_t *ops, +static size_t compress_parents_parallel(const blake3_ops_t *ops, const uint8_t *child_chaining_values, size_t num_chaining_values, const uint32_t key[8], uint8_t flags, uint8_t *out) { @@ -320,7 +320,7 @@ static size_t compress_parents_parallel(const blake3_impl_ops_t *ops, * of implementing this special rule? Because we don't want to limit SIMD or * multi-threading parallelism for that update(). */ -static size_t blake3_compress_subtree_wide(const blake3_impl_ops_t *ops, +static size_t blake3_compress_subtree_wide(const blake3_ops_t *ops, const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t *out) { @@ -406,7 +406,7 @@ static size_t blake3_compress_subtree_wide(const blake3_impl_ops_t *ops, * As with compress_subtree_wide(), this function is not used on inputs of 1 * chunk or less. That's a different codepath. */ -static void compress_subtree_to_parent_node(const blake3_impl_ops_t *ops, +static void compress_subtree_to_parent_node(const blake3_ops_t *ops, const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) { @@ -434,7 +434,7 @@ static void hasher_init_base(BLAKE3_CTX *ctx, const uint32_t key[8], memcpy(ctx->key, key, BLAKE3_KEY_LEN); chunk_state_init(&ctx->chunk, key, flags); ctx->cv_stack_len = 0; - ctx->ops = blake3_impl_get_ops(); + ctx->ops = blake3_get_ops(); } /* diff --git a/module/icp/algs/blake3/blake3_generic.c b/module/icp/algs/blake3/blake3_generic.c index 6c1eb33e89c0..ca7197a26f39 100644 --- a/module/icp/algs/blake3/blake3_generic.c +++ b/module/icp/algs/blake3/blake3_generic.c @@ -187,16 +187,17 @@ static inline void blake3_hash_many_generic(const uint8_t * const *inputs, } } -static inline boolean_t blake3_is_generic_supported(void) +/* the generic implementation is always okay */ +static boolean_t blake3_is_supported(void) { return (B_TRUE); } -const blake3_impl_ops_t blake3_generic_impl = { +const blake3_ops_t blake3_generic_impl = { .compress_in_place = blake3_compress_in_place_generic, .compress_xof = blake3_compress_xof_generic, .hash_many = blake3_hash_many_generic, - .is_supported = blake3_is_generic_supported, + .is_supported = blake3_is_supported, .degree = 4, .name = "generic" }; diff --git a/module/icp/algs/blake3/blake3_impl.c b/module/icp/algs/blake3/blake3_impl.c index 10741c82de7e..b0538e5bcc55 100644 --- a/module/icp/algs/blake3/blake3_impl.c +++ b/module/icp/algs/blake3/blake3_impl.c @@ -24,184 +24,267 @@ */ #include -#include +#include +#include +#include #include "blake3_impl.h" -static const blake3_impl_ops_t *const blake3_impls[] = { - &blake3_generic_impl, #if defined(__aarch64__) || \ (defined(__x86_64) && defined(HAVE_SSE2)) || \ (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) - &blake3_sse2_impl, -#endif -#if defined(__aarch64__) || \ - (defined(__x86_64) && defined(HAVE_SSE4_1)) || \ - (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) - &blake3_sse41_impl, -#endif -#if 
defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) - &blake3_avx2_impl, -#endif -#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) - &blake3_avx512_impl, -#endif -}; - -/* this pointer holds current ops for implementation */ -static const blake3_impl_ops_t *blake3_selected_impl = &blake3_generic_impl; - -/* special implementation selections */ -#define IMPL_FASTEST (UINT32_MAX) -#define IMPL_CYCLE (UINT32_MAX-1) -#define IMPL_USER (UINT32_MAX-2) -#define IMPL_PARAM (UINT32_MAX-3) - -#define IMPL_READ(i) (*(volatile uint32_t *) &(i)) -static uint32_t icp_blake3_impl = IMPL_FASTEST; -#define BLAKE3_IMPL_NAME_MAX 16 - -/* id of fastest implementation */ -static uint32_t blake3_fastest_id = 0; +extern void zfs_blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags); + +extern void zfs_blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +extern void zfs_blake3_hash_many_sse2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) { + kfpu_begin(); + zfs_blake3_compress_in_place_sse2(cv, block, block_len, counter, + flags); + kfpu_end(); +} -/* currently used id */ -static uint32_t blake3_current_id = 0; +static void blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) { + kfpu_begin(); + zfs_blake3_compress_xof_sse2(cv, block, block_len, counter, flags, + out); + kfpu_end(); +} -/* id of module parameter (-1 == unused) */ -static int blake3_param_id = -1; +static void blake3_hash_many_sse2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); +} -/* return number of supported implementations */ -int -blake3_get_impl_count(void) +static boolean_t blake3_is_sse2_supported(void) { - static int impls = 0; - int i; +#if defined(__x86_64) + return (kfpu_allowed() && zfs_sse2_available()); +#elif defined(__PPC64__) + return (kfpu_allowed() && zfs_vsx_available()); +#else + return (kfpu_allowed()); +#endif +} - if (impls) - return (impls); +const blake3_ops_t blake3_sse2_impl = { + .compress_in_place = blake3_compress_in_place_sse2, + .compress_xof = blake3_compress_xof_sse2, + .hash_many = blake3_hash_many_sse2, + .is_supported = blake3_is_sse2_supported, + .degree = 4, + .name = "sse2" +}; +#endif - for (i = 0; i < ARRAY_SIZE(blake3_impls); i++) { - if (!blake3_impls[i]->is_supported()) continue; - impls++; - } +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE2)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) - return (impls); +extern void zfs_blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags); + +extern void 
zfs_blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +extern void zfs_blake3_hash_many_sse41(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) { + kfpu_begin(); + zfs_blake3_compress_in_place_sse41(cv, block, block_len, counter, + flags); + kfpu_end(); } -/* return id of selected implementation */ -int -blake3_get_impl_id(void) -{ - return (blake3_current_id); +static void blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) { + kfpu_begin(); + zfs_blake3_compress_xof_sse41(cv, block, block_len, counter, flags, + out); + kfpu_end(); } -/* return name of selected implementation */ -const char * -blake3_get_impl_name(void) -{ - return (blake3_selected_impl->name); +static void blake3_hash_many_sse41(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); } -/* setup id as fastest implementation */ -void -blake3_set_impl_fastest(uint32_t id) +static boolean_t blake3_is_sse41_supported(void) { - blake3_fastest_id = id; +#if defined(__x86_64) + return (kfpu_allowed() && zfs_sse4_1_available()); +#elif defined(__PPC64__) + return (kfpu_allowed() && zfs_vsx_available()); +#else + return (kfpu_allowed()); +#endif } -/* set implementation by id */ -void -blake3_set_impl_id(uint32_t id) -{ - int i, cid; - - /* select fastest */ - if (id == IMPL_FASTEST) - id = blake3_fastest_id; - - /* select next or first */ - if (id == IMPL_CYCLE) - id = (++blake3_current_id) % blake3_get_impl_count(); - - /* 0..N for the real impl */ - for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) { - if (!blake3_impls[i]->is_supported()) continue; - if (cid == id) { - blake3_current_id = cid; - blake3_selected_impl = blake3_impls[i]; - return; - } - cid++; - } +const blake3_ops_t blake3_sse41_impl = { + .compress_in_place = blake3_compress_in_place_sse41, + .compress_xof = blake3_compress_xof_sse41, + .hash_many = blake3_hash_many_sse41, + .is_supported = blake3_is_sse41_supported, + .degree = 4, + .name = "sse41" +}; +#endif + +#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) +extern void zfs_blake3_hash_many_avx2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_hash_many_avx2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); } -/* set implementation by name */ -int 
-blake3_set_impl_name(const char *name) +static boolean_t blake3_is_avx2_supported(void) { - int i, cid; - - if (strcmp(name, "fastest") == 0) { - atomic_swap_32(&icp_blake3_impl, IMPL_FASTEST); - blake3_set_impl_id(IMPL_FASTEST); - return (0); - } else if (strcmp(name, "cycle") == 0) { - atomic_swap_32(&icp_blake3_impl, IMPL_CYCLE); - blake3_set_impl_id(IMPL_CYCLE); - return (0); - } + return (kfpu_allowed() && zfs_sse4_1_available() && + zfs_avx2_available()); +} - for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) { - if (!blake3_impls[i]->is_supported()) continue; - if (strcmp(name, blake3_impls[i]->name) == 0) { - if (icp_blake3_impl == IMPL_PARAM) { - blake3_param_id = cid; - return (0); - } - blake3_selected_impl = blake3_impls[i]; - blake3_current_id = cid; - return (0); - } - cid++; - } +const blake3_ops_t +blake3_avx2_impl = { + .compress_in_place = blake3_compress_in_place_sse41, + .compress_xof = blake3_compress_xof_sse41, + .hash_many = blake3_hash_many_avx2, + .is_supported = blake3_is_avx2_supported, + .degree = 8, + .name = "avx2" +}; +#endif - return (-EINVAL); +#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) +extern void zfs_blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags); + +extern void zfs_blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +extern void zfs_blake3_hash_many_avx512(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) { + kfpu_begin(); + zfs_blake3_compress_in_place_avx512(cv, block, block_len, counter, + flags); + kfpu_end(); } -/* setup implementation */ -void -blake3_setup_impl(void) -{ - switch (IMPL_READ(icp_blake3_impl)) { - case IMPL_PARAM: - blake3_set_impl_id(blake3_param_id); - atomic_swap_32(&icp_blake3_impl, IMPL_USER); - break; - case IMPL_FASTEST: - blake3_set_impl_id(IMPL_FASTEST); - break; - case IMPL_CYCLE: - blake3_set_impl_id(IMPL_CYCLE); - break; - default: - blake3_set_impl_id(blake3_current_id); - break; - } +static void blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) { + kfpu_begin(); + zfs_blake3_compress_xof_avx512(cv, block, block_len, counter, flags, + out); + kfpu_end(); } -/* return selected implementation */ -const blake3_impl_ops_t * -blake3_impl_get_ops(void) -{ - /* each call to ops will cycle */ - if (icp_blake3_impl == IMPL_CYCLE) - blake3_set_impl_id(IMPL_CYCLE); +static void blake3_hash_many_avx512(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); +} - return (blake3_selected_impl); +static boolean_t blake3_is_avx512_supported(void) +{ + return (kfpu_allowed() && zfs_avx512f_available() && + zfs_avx512vl_available()); } -#if defined(_KERNEL) +const 
blake3_ops_t blake3_avx512_impl = { + .compress_in_place = blake3_compress_in_place_avx512, + .compress_xof = blake3_compress_xof_avx512, + .hash_many = blake3_hash_many_avx512, + .is_supported = blake3_is_avx512_supported, + .degree = 16, + .name = "avx512" +}; +#endif + +extern const blake3_ops_t blake3_generic_impl; + +static const blake3_ops_t *const blake3_impls[] = { + &blake3_generic_impl, +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE2)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + &blake3_sse2_impl, +#endif +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE4_1)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + &blake3_sse41_impl, +#endif +#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) + &blake3_avx2_impl, +#endif +#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) + &blake3_avx512_impl, +#endif +}; + +/* use the generic implementation functions */ +#define IMPL_NAME "blake3" +#define IMPL_OPS_T blake3_ops_t +#define IMPL_ARRAY blake3_impls +#define IMPL_GET_OPS blake3_get_ops + +#define ZFS_IMPL_OPS zfs_blake3_ops +#include + +#ifdef _KERNEL void **blake3_per_cpu_ctx; void @@ -227,58 +310,96 @@ blake3_per_cpu_ctx_fini(void) memset(blake3_per_cpu_ctx, 0, max_ncpus * sizeof (void *)); kmem_free(blake3_per_cpu_ctx, max_ncpus * sizeof (void *)); } -#endif -#if defined(_KERNEL) && defined(__linux__) +#define IMPL_FMT(impl, i) (((impl) == (i)) ? "[%s] " : "%s ") + +#if defined(__linux__) + static int -icp_blake3_impl_set(const char *name, zfs_kernel_param_t *kp) +blake3_param_get(char *buffer, zfs_kernel_param_t *unused) { - char req_name[BLAKE3_IMPL_NAME_MAX]; - size_t i; + const uint32_t impl = IMPL_READ(generic_impl_chosen); + char *fmt; + int cnt = 0; - /* sanitize input */ - i = strnlen(name, BLAKE3_IMPL_NAME_MAX); - if (i == 0 || i >= BLAKE3_IMPL_NAME_MAX) - return (-EINVAL); + /* cycling */ + fmt = IMPL_FMT(impl, IMPL_CYCLE); + cnt += sprintf(buffer + cnt, fmt, "cycle"); + + /* list fastest */ + fmt = IMPL_FMT(impl, IMPL_FASTEST); + cnt += sprintf(buffer + cnt, fmt, "fastest"); + + /* list all supported implementations */ + generic_impl_init(); + for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) { + fmt = IMPL_FMT(impl, i); + cnt += sprintf(buffer + cnt, fmt, + generic_supp_impls[i]->name); + } - strlcpy(req_name, name, BLAKE3_IMPL_NAME_MAX); - while (i > 0 && isspace(req_name[i-1])) - i--; - req_name[i] = '\0'; + return (cnt); +} - atomic_swap_32(&icp_blake3_impl, IMPL_PARAM); - return (blake3_set_impl_name(req_name)); +static int +blake3_param_set(const char *val, zfs_kernel_param_t *unused) +{ + (void) unused; + return (generic_impl_setname(val)); } +#elif defined(__FreeBSD__) + +#include + static int -icp_blake3_impl_get(char *buffer, zfs_kernel_param_t *kp) +blake3_param(ZFS_MODULE_PARAM_ARGS) { - int i, cid, cnt = 0; - char *fmt; + int err; - /* cycling */ - fmt = (icp_blake3_impl == IMPL_CYCLE) ? "[cycle] " : "cycle "; - cnt += sprintf(buffer + cnt, fmt); - - /* fastest one */ - fmt = (icp_blake3_impl == IMPL_FASTEST) ? "[fastest] " : "fastest "; - cnt += sprintf(buffer + cnt, fmt); - - /* user selected */ - for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) { - if (!blake3_impls[i]->is_supported()) continue; - fmt = (icp_blake3_impl == IMPL_USER && - cid == blake3_current_id) ? 
"[%s] " : "%s "; - cnt += sprintf(buffer + cnt, fmt, blake3_impls[i]->name); - cid++; + generic_impl_init(); + if (req->newptr == NULL) { + const uint32_t impl = IMPL_READ(generic_impl_chosen); + const int init_buflen = 64; + const char *fmt; + struct sbuf *s; + + s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req); + + /* cycling */ + fmt = IMPL_FMT(impl, IMPL_CYCLE); + (void) sbuf_printf(s, fmt, "cycle"); + + /* list fastest */ + fmt = IMPL_FMT(impl, IMPL_FASTEST); + (void) sbuf_printf(s, fmt, "fastest"); + + /* list all supported implementations */ + for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) { + fmt = IMPL_FMT(impl, i); + (void) sbuf_printf(s, fmt, generic_supp_impls[i]->name); + } + + err = sbuf_finish(s); + sbuf_delete(s); + + return (err); } - buffer[cnt] = 0; + char buf[16]; - return (cnt); + err = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (err) { + return (err); + } + + return (-generic_impl_setname(buf)); } +#endif + +#undef IMPL_FMT -module_param_call(icp_blake3_impl, icp_blake3_impl_set, icp_blake3_impl_get, - NULL, 0644); -MODULE_PARM_DESC(icp_blake3_impl, "Select BLAKE3 implementation."); +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, blake3_impl, + blake3_param_set, blake3_param_get, ZMOD_RW, \ + "Select BLAKE3 implementation."); #endif diff --git a/module/icp/algs/blake3/blake3_impl.h b/module/icp/algs/blake3/blake3_impl.h index 5254061c7378..b65123013d57 100644 --- a/module/icp/algs/blake3/blake3_impl.h +++ b/module/icp/algs/blake3/blake3_impl.h @@ -25,16 +25,14 @@ * Copyright (c) 2021-2022 Tino Reichardt */ -#ifndef BLAKE3_IMPL_H +#ifndef BLAKE3_IMPL_H #define BLAKE3_IMPL_H #ifdef __cplusplus extern "C" { #endif -#include #include -#include /* * Methods used to define BLAKE3 assembler implementations @@ -55,39 +53,17 @@ typedef void (*blake3_hash_many_f)(const uint8_t * const *inputs, typedef boolean_t (*blake3_is_supported_f)(void); -typedef struct blake3_impl_ops { +typedef struct { blake3_compress_in_place_f compress_in_place; blake3_compress_xof_f compress_xof; blake3_hash_many_f hash_many; blake3_is_supported_f is_supported; int degree; const char *name; -} blake3_impl_ops_t; +} blake3_ops_t; -/* Return selected BLAKE3 implementation ops */ -extern const blake3_impl_ops_t *blake3_impl_get_ops(void); - -extern const blake3_impl_ops_t blake3_generic_impl; - -#if defined(__aarch64__) || \ - (defined(__x86_64) && defined(HAVE_SSE2)) || \ - (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) -extern const blake3_impl_ops_t blake3_sse2_impl; -#endif - -#if defined(__aarch64__) || \ - (defined(__x86_64) && defined(HAVE_SSE4_1)) || \ - (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) -extern const blake3_impl_ops_t blake3_sse41_impl; -#endif - -#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) -extern const blake3_impl_ops_t blake3_avx2_impl; -#endif - -#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) -extern const blake3_impl_ops_t blake3_avx512_impl; -#endif +/* return selected BLAKE3 implementation ops */ +extern const blake3_ops_t *blake3_get_ops(void); #if defined(__x86_64) #define MAX_SIMD_DEGREE 16 diff --git a/module/icp/algs/blake3/blake3_x86-64.c b/module/icp/algs/blake3/blake3_x86-64.c deleted file mode 100644 index a7552bdde4a8..000000000000 --- a/module/icp/algs/blake3/blake3_x86-64.c +++ /dev/null @@ -1,248 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). 
- * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright (c) 2021-2022 Tino Reichardt - */ - -#include "blake3_impl.h" - -#if defined(__aarch64__) || \ - (defined(__x86_64) && defined(HAVE_SSE2)) || \ - (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) - -extern void zfs_blake3_compress_in_place_sse2(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags); - -extern void zfs_blake3_compress_xof_sse2(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags, uint8_t out[64]); - -extern void zfs_blake3_hash_many_sse2(const uint8_t * const *inputs, - size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t counter, boolean_t increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out); - -static void blake3_compress_in_place_sse2(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags) { - kfpu_begin(); - zfs_blake3_compress_in_place_sse2(cv, block, block_len, counter, - flags); - kfpu_end(); -} - -static void blake3_compress_xof_sse2(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags, uint8_t out[64]) { - kfpu_begin(); - zfs_blake3_compress_xof_sse2(cv, block, block_len, counter, flags, - out); - kfpu_end(); -} - -static void blake3_hash_many_sse2(const uint8_t * const *inputs, - size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t counter, boolean_t increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - kfpu_begin(); - zfs_blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, out); - kfpu_end(); -} - -static boolean_t blake3_is_sse2_supported(void) -{ -#if defined(__x86_64) - return (kfpu_allowed() && zfs_sse2_available()); -#elif defined(__PPC64__) - return (kfpu_allowed() && zfs_vsx_available()); -#else - return (kfpu_allowed()); -#endif -} - -const blake3_impl_ops_t blake3_sse2_impl = { - .compress_in_place = blake3_compress_in_place_sse2, - .compress_xof = blake3_compress_xof_sse2, - .hash_many = blake3_hash_many_sse2, - .is_supported = blake3_is_sse2_supported, - .degree = 4, - .name = "sse2" -}; -#endif - -#if defined(__aarch64__) || \ - (defined(__x86_64) && defined(HAVE_SSE2)) || \ - (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) - -extern void zfs_blake3_compress_in_place_sse41(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags); - -extern void zfs_blake3_compress_xof_sse41(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags, uint8_t out[64]); - -extern void zfs_blake3_hash_many_sse41(const uint8_t * const *inputs, - size_t num_inputs, 
size_t blocks, const uint32_t key[8], - uint64_t counter, boolean_t increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out); - -static void blake3_compress_in_place_sse41(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags) { - kfpu_begin(); - zfs_blake3_compress_in_place_sse41(cv, block, block_len, counter, - flags); - kfpu_end(); -} - -static void blake3_compress_xof_sse41(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags, uint8_t out[64]) { - kfpu_begin(); - zfs_blake3_compress_xof_sse41(cv, block, block_len, counter, flags, - out); - kfpu_end(); -} - -static void blake3_hash_many_sse41(const uint8_t * const *inputs, - size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t counter, boolean_t increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - kfpu_begin(); - zfs_blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, out); - kfpu_end(); -} - -static boolean_t blake3_is_sse41_supported(void) -{ -#if defined(__x86_64) - return (kfpu_allowed() && zfs_sse4_1_available()); -#elif defined(__PPC64__) - return (kfpu_allowed() && zfs_vsx_available()); -#else - return (kfpu_allowed()); -#endif -} - -const blake3_impl_ops_t blake3_sse41_impl = { - .compress_in_place = blake3_compress_in_place_sse41, - .compress_xof = blake3_compress_xof_sse41, - .hash_many = blake3_hash_many_sse41, - .is_supported = blake3_is_sse41_supported, - .degree = 4, - .name = "sse41" -}; -#endif - -#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) -extern void zfs_blake3_hash_many_avx2(const uint8_t * const *inputs, - size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t counter, boolean_t increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out); - -static void blake3_hash_many_avx2(const uint8_t * const *inputs, - size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t counter, boolean_t increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - kfpu_begin(); - zfs_blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, out); - kfpu_end(); -} - -static boolean_t blake3_is_avx2_supported(void) -{ - return (kfpu_allowed() && zfs_sse4_1_available() && - zfs_avx2_available()); -} - -const blake3_impl_ops_t blake3_avx2_impl = { - .compress_in_place = blake3_compress_in_place_sse41, - .compress_xof = blake3_compress_xof_sse41, - .hash_many = blake3_hash_many_avx2, - .is_supported = blake3_is_avx2_supported, - .degree = 8, - .name = "avx2" -}; -#endif - -#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) -extern void zfs_blake3_compress_in_place_avx512(uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags); - -extern void zfs_blake3_compress_xof_avx512(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags, uint8_t out[64]); - -extern void zfs_blake3_hash_many_avx512(const uint8_t * const *inputs, - size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t counter, boolean_t increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out); - -static void blake3_compress_in_place_avx512(uint32_t cv[8], - const uint8_t 
block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags) { - kfpu_begin(); - zfs_blake3_compress_in_place_avx512(cv, block, block_len, counter, - flags); - kfpu_end(); -} - -static void blake3_compress_xof_avx512(const uint32_t cv[8], - const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, - uint64_t counter, uint8_t flags, uint8_t out[64]) { - kfpu_begin(); - zfs_blake3_compress_xof_avx512(cv, block, block_len, counter, flags, - out); - kfpu_end(); -} - -static void blake3_hash_many_avx512(const uint8_t * const *inputs, - size_t num_inputs, size_t blocks, const uint32_t key[8], - uint64_t counter, boolean_t increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { - kfpu_begin(); - zfs_blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, - increment_counter, flags, flags_start, flags_end, out); - kfpu_end(); -} - -static boolean_t blake3_is_avx512_supported(void) -{ - return (kfpu_allowed() && zfs_avx512f_available() && - zfs_avx512vl_available()); -} - -const blake3_impl_ops_t blake3_avx512_impl = { - .compress_in_place = blake3_compress_in_place_avx512, - .compress_xof = blake3_compress_xof_avx512, - .hash_many = blake3_hash_many_avx512, - .is_supported = blake3_is_avx512_supported, - .degree = 16, - .name = "avx512" -}; -#endif diff --git a/module/icp/algs/sha2/sha256_impl.c b/module/icp/algs/sha2/sha256_impl.c new file mode 100644 index 000000000000..7acb5d058bcb --- /dev/null +++ b/module/icp/algs/sha2/sha256_impl.c @@ -0,0 +1,271 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2022 Tino Reichardt
+ */
+
+#include
+#include
+#include
+#include
+
+#include "sha2_impl.h"
+
+#if defined(__x86_64)
+
+#if defined(HAVE_SSSE3)
+static boolean_t sha2_have_ssse3(void)
+{
+	return (kfpu_allowed() && zfs_ssse3_available());
+}
+
+extern void sha256_transform_ssse3(uint32_t s[8], const void *, size_t);
+const sha256_ops_t sha256_ssse3_impl = {
+	.is_supported = sha2_have_ssse3,
+	.transform = sha256_transform_ssse3,
+	.name = "ssse3"
+};
+#endif
+
+#if defined(HAVE_AVX)
+static boolean_t sha2_have_avx(void)
+{
+	return (kfpu_allowed() && zfs_avx_available());
+}
+
+extern void sha256_transform_avx(uint32_t s[8], const void *, size_t);
+const sha256_ops_t sha256_avx_impl = {
+	.is_supported = sha2_have_avx,
+	.transform = sha256_transform_avx,
+	.name = "avx"
+};
+#endif
+
+#if defined(HAVE_AVX2)
+static boolean_t sha2_have_avx2(void)
+{
+	return (kfpu_allowed() && zfs_avx2_available());
+}
+
+extern void sha256_transform_avx2(uint32_t s[8], const void *, size_t);
+const sha256_ops_t sha256_avx2_impl = {
+	.is_supported = sha2_have_avx2,
+	.transform = sha256_transform_avx2,
+	.name = "avx2"
+};
+#endif
+
+#if defined(HAVE_SSE4_1)
+static boolean_t sha2_have_shani(void)
+{
+	return (kfpu_allowed() && zfs_sse4_1_available() &&
+	    zfs_shani_available());
+}
+
+extern void sha256_transform_shani(uint32_t s[8], const void *, size_t);
+const sha256_ops_t sha256_shani_impl = {
+	.is_supported = sha2_have_shani,
+	.transform = sha256_transform_shani,
+	.name = "shani"
+};
+#endif
+
+#endif /* __x86_64 */
+
+#if defined(__aarch64__)
+static boolean_t sha256_have_neon(void)
+{
+	return (kfpu_allowed());
+}
+
+extern void zfs_sha256_neon(uint32_t s[8], const void *, size_t);
+const sha256_ops_t sha256_neon_impl = {
+	.is_supported = sha256_have_neon,
+	.transform = zfs_sha256_neon,
+	.name = "neon"
+};
+
+static boolean_t sha256_have_armv8ce(void)
+{
+	return (kfpu_allowed());
+}
+
+extern void zfs_sha256_armv8ce(uint32_t s[8], const void *, size_t);
+const sha256_ops_t sha256_armv8_impl = {
+	.is_supported = sha256_have_armv8ce,
+	.transform = zfs_sha256_armv8ce,
+	.name = "armv8-ce"
+};
+#endif /* __aarch64__ */
+
+#if defined(__PPC64__)
+static boolean_t sha256_have_vsx(void)
+{
+	return (kfpu_allowed() && zfs_vsx_available());
+}
+
+extern void zfs_sha256_power8(uint32_t s[8], const void *, size_t);
+const sha256_ops_t sha256_power8_impl = {
+	.is_supported = sha256_have_vsx,
+	.transform = zfs_sha256_power8,
+	.name = "power8"
+};
+
+/* the generic ppc implementation is always okay */
+static boolean_t sha256_have_ppc(void)
+{
+	return (B_TRUE);
+}
+
+extern void zfs_sha256_ppc(uint32_t s[8], const void *, size_t);
+const sha256_ops_t sha256_ppc_impl = {
+	.is_supported = sha256_have_ppc,
+	.transform = zfs_sha256_ppc,
+	.name = "ppc"
+};
+#endif /* __PPC64__ */
+
+/* the generic C implementation from sha2_generic.c */
+extern const sha256_ops_t sha256_generic_impl;
+
+/* array with all sha256 implementations */
+static const sha256_ops_t *const sha256_impls[] = {
+	&sha256_generic_impl,
+#if defined(__x86_64) && defined(HAVE_SSSE3)
+	&sha256_ssse3_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_AVX)
+	&sha256_avx_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_AVX2)
+	&sha256_avx2_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_SSE4_1)
+	&sha256_shani_impl,
+#endif
+#if defined(__aarch64__)
+	&sha256_neon_impl,
+	&sha256_armv8_impl,
+#endif
+#if defined(__PPC64__)
+	&sha256_ppc_impl,
+	&sha256_power8_impl,
+#endif /* __PPC64__ */
+};
+
+/* use the generic implementation functions */
+#define IMPL_NAME "sha256"
+#define IMPL_OPS_T sha256_ops_t
+#define IMPL_ARRAY sha256_impls
+#define IMPL_GET_OPS sha256_get_ops
+
+#define ZFS_IMPL_OPS zfs_sha256_ops
+#include <generic_impl.c>
+
+#ifdef _KERNEL
+
+#define IMPL_FMT(impl, i) (((impl) == (i)) ? "[%s] " : "%s ")
+
+#if defined(__linux__)
+
+static int
+sha256_param_get(char *buffer, zfs_kernel_param_t *unused)
+{
+	const uint32_t impl = IMPL_READ(generic_impl_chosen);
+	char *fmt;
+	int cnt = 0;
+
+	/* cycling */
+	fmt = IMPL_FMT(impl, IMPL_CYCLE);
+	cnt += sprintf(buffer + cnt, fmt, "cycle");
+
+	/* list fastest */
+	fmt = IMPL_FMT(impl, IMPL_FASTEST);
+	cnt += sprintf(buffer + cnt, fmt, "fastest");
+
+	/* list all supported implementations */
+	generic_impl_init();
+	for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) {
+		fmt = IMPL_FMT(impl, i);
+		cnt += sprintf(buffer + cnt, fmt,
+		    generic_supp_impls[i]->name);
+	}
+
+	return (cnt);
+}
+
+static int
+sha256_param_set(const char *val, zfs_kernel_param_t *unused)
+{
+	(void) unused;
+	return (generic_impl_setname(val));
+}
+
+#elif defined(__FreeBSD__)
+
+#include <sys/sbuf.h>
+
+static int
+sha256_param(ZFS_MODULE_PARAM_ARGS)
+{
+	int err;
+
+	generic_impl_init();
+	if (req->newptr == NULL) {
+		const uint32_t impl = IMPL_READ(generic_impl_chosen);
+		const int init_buflen = 64;
+		const char *fmt;
+		struct sbuf *s;
+
+		s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req);
+
+		/* cycling */
+		fmt = IMPL_FMT(impl, IMPL_CYCLE);
+		(void) sbuf_printf(s, fmt, "cycle");
+
+		/* list fastest */
+		fmt = IMPL_FMT(impl, IMPL_FASTEST);
+		(void) sbuf_printf(s, fmt, "fastest");
+
+		/* list all supported implementations */
+		for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) {
+			fmt = IMPL_FMT(impl, i);
+			(void) sbuf_printf(s, fmt, generic_supp_impls[i]->name);
+		}
+
+		err = sbuf_finish(s);
+		sbuf_delete(s);
+
+		return (err);
+	}
+
+	char buf[16];
+
+	err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
+	if (err) {
+		return (err);
+	}
+
+	return (-generic_impl_setname(buf));
+}
+#endif
+
+#undef IMPL_FMT
+
+ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, sha256_impl,
+    sha256_param_set, sha256_param_get, ZMOD_RW, \
+    "Select SHA256 implementation.");
+#endif
diff --git a/module/icp/algs/sha2/sha2_generic.c b/module/icp/algs/sha2/sha2_generic.c
new file mode 100644
index 000000000000..0708ee28c173
--- /dev/null
+++ b/module/icp/algs/sha2/sha2_generic.c
@@ -0,0 +1,564 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on public domain code in cppcrypto 0.10.
+ * Copyright (c) 2022 Tino Reichardt
+ */
+
+#include
+#include
+#include
+
+#include "sha2_impl.h"
+
+/*
+ * The low-level checksum routines use a lot of stack space.
On systems where + * small stacks are enforced (like 32-bit kernel builds), insert compiler memory + * barriers to reduce stack frame size. + */ +#if defined(_ILP32) +#define SMALL_STACK_MEMORY_BARRIER asm volatile("": : :"memory"); +#else +#define SMALL_STACK_MEMORY_BARRIER +#endif + +/* SHA256 */ +static const uint32_t SHA256_K[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, + 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, + 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, + 0xc19bf174, 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 0x983e5152, + 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, + 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, + 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, + 0xd6990624, 0xf40e3585, 0x106aa070, 0x19a4c116, 0x1e376c08, + 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, + 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +#ifndef cpu_to_be32 +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define cpu_to_be32(x) __builtin_bswap32(x) +#define cpu_to_be64(x) __builtin_bswap64(x) +#else +#define cpu_to_be32(x) (x) +#define cpu_to_be64(x) (x) +#endif +#endif + +#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) +#define Maj(x, y, z) (((x) & (y)) | ((z) & ((x) ^ (y)))) + +#define rotr32(x, n) (((x) >> n) | ((x) << (32 - n))) +#define rotr64(x, n) (((x) >> n) | ((x) << (64 - n))) + +#define sum0(x) (rotr32((x), 2) ^ rotr32((x), 13) ^ rotr32((x), 22)) +#define sum1(x) (rotr32((x), 6) ^ rotr32((x), 11) ^ rotr32((x), 25)) +#define sigma0(x) (rotr32((x), 7) ^ rotr32((x), 18) ^ ((x) >> 3)) +#define sigma1(x) (rotr32((x), 17) ^ rotr32((x), 19) ^ ((x) >> 10)) + +#define WU(j) (W[j & 15] += sigma1(W[(j + 14) & 15]) + W[(j + 9) & 15] \ + + sigma0(W[(j + 1) & 15])) + +#define COMPRESS_ROUND(i, j, K) \ + T1 = h + sum1(e) + Ch(e, f, g) + K[i + j] + (i? 
WU(j): W[j]); \ + T2 = sum0(a) + Maj(a, b, c); \ + h = g, g = f, f = e, e = d + T1; \ + d = c, c = b, b = a, a = T1 + T2; + +static void sha256_generic(uint32_t state[8], const void *data, size_t num_blks) +{ + uint64_t blk; + + for (blk = 0; blk < num_blks; blk++) { + uint32_t W[16]; + uint32_t a, b, c, d, e, f, g, h; + uint32_t T1, T2; + int i; + + for (i = 0; i < 16; i++) { + W[i] = cpu_to_be32( \ + (((const uint32_t *)(data))[blk * 16 + i])); + } + + a = state[0]; + b = state[1]; + c = state[2]; + d = state[3]; + e = state[4]; + f = state[5]; + g = state[6]; + h = state[7]; + + for (i = 0; i <= 63; i += 16) { + COMPRESS_ROUND(i, 0, SHA256_K); + COMPRESS_ROUND(i, 1, SHA256_K); + COMPRESS_ROUND(i, 2, SHA256_K); + COMPRESS_ROUND(i, 3, SHA256_K); + COMPRESS_ROUND(i, 4, SHA256_K); + COMPRESS_ROUND(i, 5, SHA256_K); + COMPRESS_ROUND(i, 6, SHA256_K); + COMPRESS_ROUND(i, 7, SHA256_K); + COMPRESS_ROUND(i, 8, SHA256_K); + COMPRESS_ROUND(i, 9, SHA256_K); + COMPRESS_ROUND(i, 10, SHA256_K); + COMPRESS_ROUND(i, 11, SHA256_K); + COMPRESS_ROUND(i, 12, SHA256_K); + COMPRESS_ROUND(i, 13, SHA256_K); + COMPRESS_ROUND(i, 14, SHA256_K); + COMPRESS_ROUND(i, 15, SHA256_K); + } + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; + } +} + +/* SHA512 */ +static const uint64_t SHA512_K[80] = { + 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, + 0xe9b5dba58189dbbc, 0x3956c25bf348b538, 0x59f111f1b605d019, + 0x923f82a4af194f9b, 0xab1c5ed5da6d8118, 0xd807aa98a3030242, + 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2, + 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, + 0xc19bf174cf692694, 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, + 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, 0x2de92c6f592b0275, + 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5, + 0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, + 0xbf597fc7beef0ee4, 0xc6e00bf33da88fc2, 0xd5a79147930aa725, + 0x06ca6351e003826f, 0x142929670a0e6e70, 0x27b70a8546d22ffc, + 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df, + 0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, + 0x92722c851482353b, 0xa2bfe8a14cf10364, 0xa81a664bbc423001, + 0xc24b8b70d0f89791, 0xc76c51a30654be30, 0xd192e819d6ef5218, + 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8, + 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, + 0x34b0bcb5e19b48a8, 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, + 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3, 0x748f82ee5defb2fc, + 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec, + 0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, + 0xc67178f2e372532b, 0xca273eceea26619c, 0xd186b8c721c0c207, + 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, 0x06f067aa72176fba, + 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b, + 0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, + 0x431d67c49c100d4c, 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, + 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 +}; + +#undef sum0 +#undef sum1 +#undef sigma0 +#undef sigma1 + +#define sum0(x) (rotr64((x), 28) ^ rotr64((x), 34) ^ rotr64((x), 39)) +#define sum1(x) (rotr64((x), 14) ^ rotr64((x), 18) ^ rotr64((x), 41)) +#define sigma0(x) (rotr64((x), 1) ^ rotr64((x), 8) ^ ((x) >> 7)) +#define sigma1(x) (rotr64((x), 19) ^ rotr64((x), 61) ^ ((x) >> 6)) + +static void sha512_generic(uint64_t state[8], const void *data, size_t num_blks) +{ + uint64_t blk; + + for (blk = 0; blk < num_blks; blk++) { + uint64_t W[16]; + uint64_t a, b, c, 
d, e, f, g, h; + uint64_t T1, T2; + int i; + + for (i = 0; i < 16; i++) { + W[i] = cpu_to_be64( \ + (((const uint64_t *)(data))[blk * 16 + i])); + } + + a = state[0]; + b = state[1]; + c = state[2]; + d = state[3]; + e = state[4]; + f = state[5]; + g = state[6]; + h = state[7]; + + for (i = 0; i <= 79; i += 16) { + COMPRESS_ROUND(i, 0, SHA512_K); + COMPRESS_ROUND(i, 1, SHA512_K); + COMPRESS_ROUND(i, 2, SHA512_K); + COMPRESS_ROUND(i, 3, SHA512_K); + COMPRESS_ROUND(i, 4, SHA512_K); + + SMALL_STACK_MEMORY_BARRIER + COMPRESS_ROUND(i, 5, SHA512_K); + COMPRESS_ROUND(i, 6, SHA512_K); + COMPRESS_ROUND(i, 7, SHA512_K); + COMPRESS_ROUND(i, 8, SHA512_K); + COMPRESS_ROUND(i, 9, SHA512_K); + + SMALL_STACK_MEMORY_BARRIER + COMPRESS_ROUND(i, 10, SHA512_K); + COMPRESS_ROUND(i, 11, SHA512_K); + COMPRESS_ROUND(i, 12, SHA512_K); + COMPRESS_ROUND(i, 13, SHA512_K); + COMPRESS_ROUND(i, 14, SHA512_K); + COMPRESS_ROUND(i, 15, SHA512_K); + } + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; + } +} + +static void sha256_update(sha256_ctx *ctx, const uint8_t *data, size_t len) +{ + uint64_t pos = ctx->count[0]; + uint64_t total = ctx->count[1]; + uint8_t *m = ctx->wbuf; + const sha256_ops_t *ops = ctx->ops; + + if (pos && pos + len >= 64) { + memcpy(m + pos, data, 64 - pos); + ops->transform(ctx->state, m, 1); + len -= 64 - pos; + total += (64 - pos) * 8; + data += 64 - pos; + pos = 0; + } + + if (len >= 64) { + uint32_t blocks = len / 64; + uint32_t bytes = blocks * 64; + ops->transform(ctx->state, data, blocks); + len -= bytes; + total += (bytes) * 8; + data += bytes; + } + memcpy(m + pos, data, len); + + pos += len; + total += len * 8; + ctx->count[0] = pos; + ctx->count[1] = total; +} + +static void sha512_update(sha512_ctx *ctx, const uint8_t *data, size_t len) +{ + uint64_t pos = ctx->count[0]; + uint64_t total = ctx->count[1]; + uint8_t *m = ctx->wbuf; + const sha512_ops_t *ops = ctx->ops; + + if (pos && pos + len >= 128) { + memcpy(m + pos, data, (size_t)(128 - pos)); + ops->transform(ctx->state, m, 1); + len -= 128 - pos; + total += (128 - pos) * 8; + data += 128 - pos; + pos = 0; + } + + if (len >= 128) { + uint64_t blocks = len / 128; + uint64_t bytes = blocks * 128; + ops->transform(ctx->state, data, blocks); + len -= bytes; + total += (bytes) * 8; + data += bytes; + } + memcpy(m + pos, data, (size_t)(len)); + + pos += len; + total += len * 8; + ctx->count[0] = pos; + ctx->count[1] = total; +} + +static void sha256_final(sha256_ctx *ctx, uint8_t *result, int bits) +{ + uint64_t mlen, pos = ctx->count[0]; + uint8_t *m = ctx->wbuf; + uint32_t *R = (uint32_t *)result; + const sha256_ops_t *ops = ctx->ops; + + m[pos++] = 0x80; + if (pos > 56) { + memset(m + pos, 0, (size_t)(64 - pos)); + ops->transform(ctx->state, m, 1); + pos = 0; + } + + memset(m + pos, 0, (size_t)(56 - pos)); + mlen = cpu_to_be64(ctx->count[1]); + memcpy(m + (64 - 8), &mlen, 64 / 8); + ops->transform(ctx->state, m, 1); + + switch (bits) { + case 224: /* 28 */ + R[0] = cpu_to_be32(ctx->state[0]); + R[1] = cpu_to_be32(ctx->state[1]); + R[2] = cpu_to_be32(ctx->state[2]); + R[3] = cpu_to_be32(ctx->state[3]); + R[4] = cpu_to_be32(ctx->state[4]); + R[5] = cpu_to_be32(ctx->state[5]); + R[6] = cpu_to_be32(ctx->state[6]); + break; + case 256: /* 32 */ + R[0] = cpu_to_be32(ctx->state[0]); + R[1] = cpu_to_be32(ctx->state[1]); + R[2] = cpu_to_be32(ctx->state[2]); + R[3] = cpu_to_be32(ctx->state[3]); + R[4] = cpu_to_be32(ctx->state[4]); + R[5] = 
cpu_to_be32(ctx->state[5]);
+ R[6] = cpu_to_be32(ctx->state[6]);
+ R[7] = cpu_to_be32(ctx->state[7]);
+ break;
+ }
+
+ memset(ctx, 0, sizeof (*ctx));
+}
+
+static void sha512_final(sha512_ctx *ctx, uint8_t *result, int bits)
+{
+ uint64_t mlen, pos = ctx->count[0];
+ uint8_t *m = ctx->wbuf, *r;
+ uint64_t *R = (uint64_t *)result;
+ const sha512_ops_t *ops = ctx->ops;
+
+ m[pos++] = 0x80;
+ if (pos > 112) {
+ memset(m + pos, 0, (size_t)(128 - pos));
+ ops->transform(ctx->state, m, 1);
+ pos = 0;
+ }
+
+ memset(m + pos, 0, (size_t)(128 - pos));
+ mlen = cpu_to_be64(ctx->count[1]);
+ memcpy(m + (128 - 8), &mlen, 64 / 8);
+ ops->transform(ctx->state, m, 1);
+
+ switch (bits) {
+ case 224: /* 28 bytes => 3.5 x 8 */
+ r = result + 24;
+ R[0] = cpu_to_be64(ctx->state[0]);
+ R[1] = cpu_to_be64(ctx->state[1]);
+ R[2] = cpu_to_be64(ctx->state[2]);
+ /* last 4 bytes are special here */
+ *r++ = (uint8_t)(ctx->state[3] >> 56);
+ *r++ = (uint8_t)(ctx->state[3] >> 48);
+ *r++ = (uint8_t)(ctx->state[3] >> 40);
+ *r++ = (uint8_t)(ctx->state[3] >> 32);
+ break;
+ case 256: /* 32 */
+ R[0] = cpu_to_be64(ctx->state[0]);
+ R[1] = cpu_to_be64(ctx->state[1]);
+ R[2] = cpu_to_be64(ctx->state[2]);
+ R[3] = cpu_to_be64(ctx->state[3]);
+ break;
+ case 384: /* 48 */
+ R[0] = cpu_to_be64(ctx->state[0]);
+ R[1] = cpu_to_be64(ctx->state[1]);
+ R[2] = cpu_to_be64(ctx->state[2]);
+ R[3] = cpu_to_be64(ctx->state[3]);
+ R[4] = cpu_to_be64(ctx->state[4]);
+ R[5] = cpu_to_be64(ctx->state[5]);
+ break;
+ case 512: /* 64 */
+ R[0] = cpu_to_be64(ctx->state[0]);
+ R[1] = cpu_to_be64(ctx->state[1]);
+ R[2] = cpu_to_be64(ctx->state[2]);
+ R[3] = cpu_to_be64(ctx->state[3]);
+ R[4] = cpu_to_be64(ctx->state[4]);
+ R[5] = cpu_to_be64(ctx->state[5]);
+ R[6] = cpu_to_be64(ctx->state[6]);
+ R[7] = cpu_to_be64(ctx->state[7]);
+ break;
+ }
+
+ memset(ctx, 0, sizeof (*ctx));
+}
+
+/* SHA2 Init function */
+void
+SHA2Init(SHA2_CTX *ctx, int algotype)
+{
+ ctx->algotype = algotype;
+ switch (ctx->algotype) {
+ case SHA224:
+ ctx->sha256.state[0] = 0xc1059ed8;
+ ctx->sha256.state[1] = 0x367cd507;
+ ctx->sha256.state[2] = 0x3070dd17;
+ ctx->sha256.state[3] = 0xf70e5939;
+ ctx->sha256.state[4] = 0xffc00b31;
+ ctx->sha256.state[5] = 0x68581511;
+ ctx->sha256.state[6] = 0x64f98fa7;
+ ctx->sha256.state[7] = 0xbefa4fa4;
+ ctx->sha256.count[0] = 0;
+ ctx->sha256.count[1] = 0;
+ ctx->sha256.ops = sha256_get_ops();
+ break;
+ case SHA256:
+ ctx->sha256.state[0] = 0x6a09e667;
+ ctx->sha256.state[1] = 0xbb67ae85;
+ ctx->sha256.state[2] = 0x3c6ef372;
+ ctx->sha256.state[3] = 0xa54ff53a;
+ ctx->sha256.state[4] = 0x510e527f;
+ ctx->sha256.state[5] = 0x9b05688c;
+ ctx->sha256.state[6] = 0x1f83d9ab;
+ ctx->sha256.state[7] = 0x5be0cd19;
+ ctx->sha256.count[0] = 0;
+ ctx->sha256.count[1] = 0;
+ ctx->sha256.ops = sha256_get_ops();
+ break;
+ case SHA384:
+ ctx->sha512.state[0] = 0xcbbb9d5dc1059ed8ULL;
+ ctx->sha512.state[1] = 0x629a292a367cd507ULL;
+ ctx->sha512.state[2] = 0x9159015a3070dd17ULL;
+ ctx->sha512.state[3] = 0x152fecd8f70e5939ULL;
+ ctx->sha512.state[4] = 0x67332667ffc00b31ULL;
+ ctx->sha512.state[5] = 0x8eb44a8768581511ULL;
+ ctx->sha512.state[6] = 0xdb0c2e0d64f98fa7ULL;
+ ctx->sha512.state[7] = 0x47b5481dbefa4fa4ULL;
+ ctx->sha512.count[0] = 0;
+ ctx->sha512.count[1] = 0;
+ ctx->sha512.ops = sha512_get_ops();
+ break;
+ case SHA512:
+ ctx->sha512.state[0] = 0x6a09e667f3bcc908ULL;
+ ctx->sha512.state[1] = 0xbb67ae8584caa73bULL;
+ ctx->sha512.state[2] = 0x3c6ef372fe94f82bULL;
+ ctx->sha512.state[3] = 0xa54ff53a5f1d36f1ULL;
+ ctx->sha512.state[4] = 0x510e527fade682d1ULL;
+ ctx->sha512.state[5] =
0x9b05688c2b3e6c1fULL;
+ ctx->sha512.state[6] = 0x1f83d9abfb41bd6bULL;
+ ctx->sha512.state[7] = 0x5be0cd19137e2179ULL;
+ ctx->sha512.count[0] = 0;
+ ctx->sha512.count[1] = 0;
+ ctx->sha512.ops = sha512_get_ops();
+ break;
+ case SHA512_224:
+ ctx->sha512.state[0] = 0x8c3d37c819544da2ULL;
+ ctx->sha512.state[1] = 0x73e1996689dcd4d6ULL;
+ ctx->sha512.state[2] = 0x1dfab7ae32ff9c82ULL;
+ ctx->sha512.state[3] = 0x679dd514582f9fcfULL;
+ ctx->sha512.state[4] = 0x0f6d2b697bd44da8ULL;
+ ctx->sha512.state[5] = 0x77e36f7304c48942ULL;
+ ctx->sha512.state[6] = 0x3f9d85a86a1d36c8ULL;
+ ctx->sha512.state[7] = 0x1112e6ad91d692a1ULL;
+ ctx->sha512.count[0] = 0;
+ ctx->sha512.count[1] = 0;
+ ctx->sha512.ops = sha512_get_ops();
+ break;
+ case SHA512_256:
+ ctx->sha512.state[0] = 0x22312194fc2bf72cULL;
+ ctx->sha512.state[1] = 0x9f555fa3c84c64c2ULL;
+ ctx->sha512.state[2] = 0x2393b86b6f53b151ULL;
+ ctx->sha512.state[3] = 0x963877195940eabdULL;
+ ctx->sha512.state[4] = 0x96283ee2a88effe3ULL;
+ ctx->sha512.state[5] = 0xbe5e1e2553863992ULL;
+ ctx->sha512.state[6] = 0x2b0199fc2c85b8aaULL;
+ ctx->sha512.state[7] = 0x0eb72ddc81c52ca2ULL;
+ ctx->sha512.count[0] = 0;
+ ctx->sha512.count[1] = 0;
+ ctx->sha512.ops = sha512_get_ops();
+ break;
+ }
+}
+
+/* SHA2 Update function */
+void
+SHA2Update(SHA2_CTX *ctx, const void *data, size_t len)
+{
+ switch (ctx->algotype) {
+ case SHA224:
+ case SHA256:
+ sha256_update(&ctx->sha256, data, len);
+ break;
+ case SHA384:
+ case SHA512:
+ case SHA512_224:
+ case SHA512_256:
+ sha512_update(&ctx->sha512, data, len);
+ break;
+ }
+}
+
+/* SHA2 Final function */
+void
+SHA2Final(SHA2_CTX *ctx, void *digest)
+{
+ switch (ctx->algotype) {
+ case SHA224:
+ sha256_final(&ctx->sha256, digest, 224);
+ break;
+ case SHA256:
+ sha256_final(&ctx->sha256, digest, 256);
+ break;
+ case SHA384:
+ sha512_final(&ctx->sha512, digest, 384);
+ break;
+ case SHA512:
+ sha512_final(&ctx->sha512, digest, 512);
+ break;
+ case SHA512_224:
+ sha512_final(&ctx->sha512, digest, 224);
+ break;
+ case SHA512_256:
+ sha512_final(&ctx->sha512, digest, 256);
+ break;
+ }
+}
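+
+/*
+ * Typical usage, shown for SHA256 (an illustrative sketch; buf and
+ * buflen stand for any caller-supplied message, and SHA256 yields a
+ * 32-byte digest):
+ *
+ *  SHA2_CTX ctx;
+ *  uint8_t digest[32];
+ *
+ *  SHA2Init(&ctx, SHA256);
+ *  SHA2Update(&ctx, buf, buflen);
+ *  SHA2Final(&ctx, digest);
+ */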
+
+/* the generic implementation is always okay */
+static boolean_t sha2_is_supported(void)
+{
+ return (B_TRUE);
+}
+
+const sha256_ops_t sha256_generic_impl = {
+ .name = "generic",
+ .transform = sha256_generic,
+ .is_supported = sha2_is_supported
+};
+
+const sha512_ops_t sha512_generic_impl = {
+ .name = "generic",
+ .transform = sha512_generic,
+ .is_supported = sha2_is_supported
+};
diff --git a/module/icp/algs/sha2/sha2_impl.h b/module/icp/algs/sha2/sha2_impl.h
new file mode 100644
index 000000000000..daab1983f218
--- /dev/null
+++ b/module/icp/algs/sha2/sha2_impl.h
@@ -0,0 +1,61 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2022 Tino Reichardt
+ */
+
+#ifndef SHA2_IMPL_H
+#define SHA2_IMPL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include
+
+/* transform function definition */
+typedef void (*sha256_f)(uint32_t state[8], const void *data, size_t blks);
+typedef void (*sha512_f)(uint64_t state[8], const void *data, size_t blks);
+
+/* needed for checking valid implementations */
+typedef boolean_t (*sha2_is_supported_f)(void);
+
+typedef struct {
+ const char *name;
+ sha256_f transform;
+ sha2_is_supported_f is_supported;
+} sha256_ops_t;
+
+typedef struct {
+ const char *name;
+ sha512_f transform;
+ sha2_is_supported_f is_supported;
+} sha512_ops_t;
+
+extern const sha256_ops_t *sha256_get_ops(void);
+extern const sha512_ops_t *sha512_get_ops(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SHA2_IMPL_H */
diff --git a/module/icp/algs/sha2/sha512_impl.c b/module/icp/algs/sha2/sha512_impl.c
new file mode 100644
index 000000000000..c779d4d9fca6
--- /dev/null
+++ b/module/icp/algs/sha2/sha512_impl.c
@@ -0,0 +1,223 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2022 Tino Reichardt
+ */
+
+#include
+#include
+#include
+#include
+
+#include "sha2_impl.h"
+
+#if defined(__x86_64)
+
+#if defined(HAVE_AVX)
+static boolean_t sha2_have_avx(void)
+{
+ return (kfpu_allowed() && zfs_avx_available());
+}
+
+extern void sha512_transform_avx(uint64_t s[8], const void *, size_t);
+const sha512_ops_t sha512_avx_impl = {
+ .is_supported = sha2_have_avx,
+ .transform = sha512_transform_avx,
+ .name = "avx"
+};
+#endif
+
+#if defined(HAVE_AVX2)
+static boolean_t sha2_have_avx2(void)
+{
+ return (kfpu_allowed() && zfs_avx2_available());
+}
+
+extern void sha512_transform_avx2(uint64_t s[8], const void *, size_t);
+const sha512_ops_t sha512_avx2_impl = {
+ .is_supported = sha2_have_avx2,
+ .transform = sha512_transform_avx2,
+ .name = "avx2"
+};
+#endif
+
+#endif /* __x86_64 */
+
+#if defined(__aarch64__)
+static boolean_t sha512_have_armv8ce(void)
+{
+ return (kfpu_allowed());
+}
+
+extern void zfs_sha512_armv8ce(uint64_t s[8], const void *, size_t);
+const sha512_ops_t sha512_armv8_impl = {
+ .is_supported = sha512_have_armv8ce,
+ .transform = zfs_sha512_armv8ce,
+ .name = "armv8-ce"
+};
+#endif /* __aarch64__ */
+
+#if defined(__PPC64__)
+static boolean_t sha512_have_vsx(void)
+{
+ return (kfpu_allowed() && zfs_vsx_available());
+}
+
+extern void zfs_sha512_power8(uint64_t s[8], const void *, size_t);
+const sha512_ops_t sha512_power8_impl = {
+ .is_supported = sha512_have_vsx,
+ .transform = zfs_sha512_power8,
+ .name = "power8"
+};
+
+/* the generic ppc variant is always usable; still provide the hook */
+static boolean_t sha512_have_ppc(void)
+{
+ return (B_TRUE);
+}
+
+extern void zfs_sha512_ppc(uint64_t s[8], const void *, size_t);
+const sha512_ops_t sha512_ppc_impl = {
+ .is_supported = sha512_have_ppc,
+ .transform = zfs_sha512_ppc,
+ .name = "ppc"
+};
+#endif /* __PPC64__ */
+
+/* the generic C fallback, defined in sha2_generic.c */
+extern const sha512_ops_t sha512_generic_impl;
+
+/* array with all sha512 implementations */
+static const sha512_ops_t *const sha512_impls[] = {
+ &sha512_generic_impl,
+#if defined(__x86_64) && defined(HAVE_AVX)
+ &sha512_avx_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_AVX2)
+ &sha512_avx2_impl,
+#endif
+#if defined(__aarch64__)
+ &sha512_armv8_impl,
+#endif
+#if defined(__PPC64__)
+ &sha512_ppc_impl,
+ &sha512_power8_impl,
+#endif /* __PPC64__ */
+};
+
+/* use the generic implementation functions */
+#define IMPL_NAME "sha512"
+#define IMPL_OPS_T sha512_ops_t
+#define IMPL_ARRAY sha512_impls
+#define IMPL_GET_OPS sha512_get_ops
+
+#define ZFS_IMPL_OPS zfs_sha512_ops
+#include
+
+#ifdef _KERNEL
+
+#define IMPL_FMT(impl, i) (((impl) == (i)) ?
"[%s] " : "%s ") + +#if defined(__linux__) + +static int +sha512_param_get(char *buffer, zfs_kernel_param_t *unused) +{ + const uint32_t impl = IMPL_READ(generic_impl_chosen); + char *fmt; + int cnt = 0; + + /* cycling */ + fmt = IMPL_FMT(impl, IMPL_CYCLE); + cnt += sprintf(buffer + cnt, fmt, "cycle"); + + /* list fastest */ + fmt = IMPL_FMT(impl, IMPL_FASTEST); + cnt += sprintf(buffer + cnt, fmt, "fastest"); + + /* list all supported implementations */ + generic_impl_init(); + for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) { + fmt = IMPL_FMT(impl, i); + cnt += sprintf(buffer + cnt, fmt, + generic_supp_impls[i]->name); + } + + return (cnt); +} + +static int +sha512_param_set(const char *val, zfs_kernel_param_t *unused) +{ + (void) unused; + return (generic_impl_setname(val)); +} + +#elif defined(__FreeBSD__) + +#include + +static int +sha512_param(ZFS_MODULE_PARAM_ARGS) +{ + int err; + + generic_impl_init(); + if (req->newptr == NULL) { + const uint32_t impl = IMPL_READ(generic_impl_chosen); + const int init_buflen = 64; + const char *fmt; + struct sbuf *s; + + s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req); + + /* cycling */ + fmt = IMPL_FMT(impl, IMPL_CYCLE); + (void) sbuf_printf(s, fmt, "cycle"); + + /* list fastest */ + fmt = IMPL_FMT(impl, IMPL_FASTEST); + (void) sbuf_printf(s, fmt, "fastest"); + + /* list all supported implementations */ + for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) { + fmt = IMPL_FMT(impl, i); + (void) sbuf_printf(s, fmt, generic_supp_impls[i]->name); + } + + err = sbuf_finish(s); + sbuf_delete(s); + + return (err); + } + + /* we got module parameter */ + char buf[16]; + + err = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (err) { + return (err); + } + + return (-generic_impl_setname(buf)); +} +#endif + +#undef IMPL_FMT + +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, sha512_impl, + sha512_param_set, sha512_param_get, ZMOD_RW, \ + "Select SHA512 implementation."); +#endif diff --git a/module/icp/asm-aarch64/sha2/sha256-armv8.S b/module/icp/asm-aarch64/sha2/sha256-armv8.S new file mode 100644 index 000000000000..1f98f9585a49 --- /dev/null +++ b/module/icp/asm-aarch64/sha2/sha256-armv8.S @@ -0,0 +1,1020 @@ +/* + * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. 
You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#if defined(__aarch64__) + +.align 6 +.type .LK256,%object +.LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator +.size .LK256,.-.LK256 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 + +.globl zfs_sha256_armv8ce +.type zfs_sha256_armv8ce,%function +.align 6 +zfs_sha256_armv8ce: +.Lv8_entry: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adr x3,.LK256 + +.Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h 
v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,.Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret +.size zfs_sha256_armv8ce,.-zfs_sha256_armv8ce + +.globl zfs_sha256_neon +.type zfs_sha256_neon,%function +.align 4 +zfs_sha256_neon: +.Lneon_entry: + stp x29, x30, [sp, #-16]! 
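+ // AAPCS64 arguments, matching the C prototype
+ // zfs_sha256_neon(uint32_t s[8], const void *data, size_t blks):
+ // x0 = state, x1 = input data, x2 = number of 64-byte blocks.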
+ mov x29, sp + sub sp,sp,#16*4 + + adr x16,.LK256 + add x2,x1,x2,lsl#6 // len to point at the end of inp + + ld1 {v0.16b},[x1], #16 + ld1 {v1.16b},[x1], #16 + ld1 {v2.16b},[x1], #16 + ld1 {v3.16b},[x1], #16 + ld1 {v4.4s},[x16], #16 + ld1 {v5.4s},[x16], #16 + ld1 {v6.4s},[x16], #16 + ld1 {v7.4s},[x16], #16 + rev32 v0.16b,v0.16b // yes, even on + rev32 v1.16b,v1.16b // big-endian + rev32 v2.16b,v2.16b + rev32 v3.16b,v3.16b + mov x17,sp + add v4.4s,v4.4s,v0.4s + add v5.4s,v5.4s,v1.4s + add v6.4s,v6.4s,v2.4s + st1 {v4.4s,v5.4s},[x17], #32 + add v7.4s,v7.4s,v3.4s + st1 {v6.4s,v7.4s},[x17] + sub x17,x17,#32 + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#8] + ldp w7,w8,[x0,#16] + ldp w9,w10,[x0,#24] + ldr w12,[sp,#0] + mov w13,wzr + eor w14,w4,w5 + mov w15,wzr + b .L_00_48 + +.align 4 +.L_00_48: + ext v4.16b,v0.16b,v1.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v2.16b,v3.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v3.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v0.4s,v0.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v0.4s,v0.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v0.4s,v0.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v0.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v0.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v0.4s,#15 + add w8,w8,w12 + ushr v17.4s,v0.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v0.4s,#13 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v0.4s,v0.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v0.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v1.16b,v2.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v3.16b,v0.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v0.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v1.4s,v1.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli 
v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v1.4s,v1.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v1.4s,v1.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v1.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v1.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v1.4s,#15 + add w4,w4,w12 + ushr v17.4s,v1.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v1.4s,#13 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v1.4s,v1.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v1.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + ext v4.16b,v2.16b,v3.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v0.16b,v1.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v1.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v2.4s,v2.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v2.4s,v2.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v2.4s,v2.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v2.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v2.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v2.4s,#15 + add w8,w8,w12 + ushr v17.4s,v2.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v2.4s,#13 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + 
add v2.4s,v2.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v2.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v3.16b,v0.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v1.16b,v2.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v2.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v3.4s,v3.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v3.4s,v3.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v3.4s,v3.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v3.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v3.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v3.4s,#15 + add w4,w4,w12 + ushr v17.4s,v3.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v3.4s,#13 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v3.4s,v3.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v3.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[x16] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + cmp w12,#0 // check for K256 terminator + ldr w12,[sp,#0] + sub x17,x17,#64 + bne .L_00_48 + sub x16,x16,#256 // rewind x16 + cmp x1,x2 + mov x17, #64 + csel x17, x17, xzr, eq + sub x1,x1,x17 // avoid SEGV + mov x17,sp + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v0.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v0.16b,v0.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v0.4s + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add 
w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v1.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v1.16b,v1.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v1.4s + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v2.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v2.16b,v2.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v2.4s + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor 
w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v3.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v3.16b,v3.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v3.4s + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w3,w3,w15 // h+=Sigma0(a) from the past + ldp w11,w12,[x0,#0] + add w3,w3,w13 // h+=Maj(a,b,c) from the past + ldp w13,w14,[x0,#8] + add w3,w3,w11 // accumulate + add w4,w4,w12 + ldp w11,w12,[x0,#16] + add w5,w5,w13 + add w6,w6,w14 + ldp w13,w14,[x0,#24] + add w7,w7,w11 + add w8,w8,w12 + ldr w12,[sp,#0] + stp w3,w4,[x0,#0] + add w9,w9,w13 + mov w13,wzr + stp w5,w6,[x0,#8] + add w10,w10,w14 + stp w7,w8,[x0,#16] + eor w14,w4,w5 + stp w9,w10,[x0,#24] + mov w15,wzr + mov x17,sp + b.ne .L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret +.size zfs_sha256_neon,.-zfs_sha256_neon +#endif diff --git a/module/icp/asm-aarch64/sha2/sha512-armv8.S b/module/icp/asm-aarch64/sha2/sha512-armv8.S new file mode 100644 index 000000000000..d82084f02abc --- /dev/null +++ b/module/icp/asm-aarch64/sha2/sha512-armv8.S @@ -0,0 +1,580 @@ +/* + * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. 
You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#if defined(__aarch64__) + +.text +.align 6 +.type .LK512,%object +.LK512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0 // terminator +.size .LK512,.-.LK512 +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 + +.globl zfs_sha512_armv8ce +.type zfs_sha512_armv8ce,%function +.align 6 +zfs_sha512_armv8ce: +.Lv8_entry: + stp x29,x30,[sp,#-16]! 
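+ // x0 = uint64_t state[8] (the context loaded below), x1 = input
+ // data, x2 = number of 128-byte blocks, as in the C prototype
+ // zfs_sha512_armv8ce(uint64_t s[8], const void *data, size_t blks).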
+ add x29,sp,#0 + + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context + adr x3,.LK512 + + rev64 v16.16b,v16.16b + rev64 v17.16b,v17.16b + rev64 v18.16b,v18.16b + rev64 v19.16b,v19.16b + rev64 v20.16b,v20.16b + rev64 v21.16b,v21.16b + rev64 v22.16b,v22.16b + rev64 v23.16b,v23.16b + b .Loop_hw + +.align 4 +.Loop_hw: + ld1 {v24.2d},[x3],#16 + subs x2,x2,#1 + sub x4,x1,#128 + orr v26.16b,v0.16b,v0.16b // offload + orr v27.16b,v1.16b,v1.16b + orr v28.16b,v2.16b,v2.16b + orr v29.16b,v3.16b,v3.16b + csel x1,x1,x4,ne // conditional rewind + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add 
v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v22.2d + ld1 
{v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add 
v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08230 //sha512su0 v16.16b,v17.16b + ext v7.16b,v20.16b,v21.16b,#8 +.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08251 //sha512su0 v17.16b,v18.16b + ext v7.16b,v21.16b,v22.16b,#8 +.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08272 //sha512su0 v18.16b,v19.16b + ext v7.16b,v22.16b,v23.16b,#8 +.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08293 //sha512su0 v19.16b,v20.16b + ext v7.16b,v23.16b,v16.16b,#8 +.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b +.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b + ext v7.16b,v16.16b,v17.16b,#8 +.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b +.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b + ext v7.16b,v17.16b,v18.16b,#8 +.inst 
0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b +.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b + ext v7.16b,v18.16b,v19.16b,#8 +.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b +.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08217 //sha512su0 v23.16b,v16.16b + ext v7.16b,v19.16b,v20.16b,#8 +.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b +.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v16.2d + ld1 {v16.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b + rev64 v16.16b,v16.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v17.2d + ld1 {v17.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b + rev64 v17.16b,v17.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v18.2d + ld1 {v18.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b + rev64 v18.16b,v18.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v19.2d + ld1 {v19.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b + rev64 v19.16b,v19.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v20.2d + ld1 {v20.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b + rev64 v20.16b,v20.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v21.2d + ld1 {v21.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b + rev64 v21.16b,v21.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v22.2d + ld1 {v22.16b},[x1],#16 // load next input 
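+// The closing rounds interleave the ld1 loads and rev64 byte swaps
+// of the next input block with the remaining sha512h/sha512h2 rounds,
+// hiding the load latency behind the SHA-512 crypto unit.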
+ ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b + rev64 v22.16b,v22.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b + sub x3,x3,#80*8 // rewind + add v25.2d,v25.2d,v23.2d + ld1 {v23.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b + rev64 v23.16b,v23.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b + add v0.2d,v0.2d,v26.2d // accumulate + add v1.2d,v1.2d,v27.2d + add v2.2d,v2.2d,v28.2d + add v3.2d,v3.2d,v29.2d + + cbnz x2,.Loop_hw + + st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context + + ldr x29,[sp],#16 + ret +.size zfs_sha512_armv8ce,.-zfs_sha512_armv8ce +#endif diff --git a/module/icp/asm-ppc64/sha2/sha256-p8.S b/module/icp/asm-ppc64/sha2/sha256-p8.S new file mode 100644 index 000000000000..6e18f4e9752b --- /dev/null +++ b/module/icp/asm-ppc64/sha2/sha256-p8.S @@ -0,0 +1,1498 @@ +/* + * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#if (defined(__PPC64__) && defined(__BIG_ENDIAN__)) + +.text + +.globl zfs_sha256_power8 +.type zfs_sha256_power8,@function +.section ".opd","aw" +.align 3 +zfs_sha256_power8: +.quad .zfs_sha256_power8,.TOC.@tocbase,0 +.previous +.align 6 +.zfs_sha256_power8: + stdu 1,-384(1) + mflr 8 + li 10,207 + li 11,223 + stvx 24,10,1 + addi 10,10,32 + mfspr 12,256 + stvx 25,11,1 + addi 11,11,32 + stvx 26,10,1 + addi 10,10,32 + stvx 27,11,1 + addi 11,11,32 + stvx 28,10,1 + addi 10,10,32 + stvx 29,11,1 + addi 11,11,32 + stvx 30,10,1 + stvx 31,11,1 + li 11,-4096+255 + stw 12,332(1) + li 10,0x10 + std 26,336(1) + li 26,0x20 + std 27,344(1) + li 27,0x30 + std 28,352(1) + li 28,0x40 + std 29,360(1) + li 29,0x50 + std 30,368(1) + li 30,0x60 + std 31,376(1) + li 31,0x70 + std 8,400(1) + mtspr 256,11 + + bl .LPICmeup + addi 11,1,79 + .long 0x7C001E19 + .long 0x7C8A1E19 + vsldoi 1,0,0,4 + vsldoi 2,0,0,8 + vsldoi 3,0,0,12 + vsldoi 5,4,4,4 + vsldoi 6,4,4,8 + vsldoi 7,4,4,12 + li 0,3 + b .Loop +.align 5 +.Loop: + lvx 28,0,6 + .long 0x7D002699 + addi 4,4,16 + mr 7,6 + stvx 0,0,11 + stvx 1,10,11 + stvx 2,26,11 + stvx 3,27,11 + stvx 4,28,11 + stvx 5,29,11 + stvx 6,30,11 + stvx 7,31,11 + vadduwm 7,7,28 + lvx 28,10,6 + vadduwm 7,7,8 + vsel 29,6,5,4 + vadduwm 6,6,28 + vadduwm 7,7,29 + .long 0x13C4FE82 + vadduwm 7,7,30 + vxor 29,0,1 + vsel 29,1,2,29 + vadduwm 3,3,7 + .long 0x13C08682 + vadduwm 30,30,29 + vadduwm 7,7,30 + lvx 28,26,7 + vsldoi 9,8,8,4 + vadduwm 6,6,9 + vsel 29,5,4,3 + vadduwm 5,5,28 + vadduwm 6,6,29 + .long 0x13C3FE82 + vadduwm 6,6,30 + vxor 29,7,0 + vsel 29,0,1,29 + vadduwm 2,2,6 + .long 0x13C78682 + vadduwm 30,30,29 + vadduwm 6,6,30 + lvx 28,27,7 + vsldoi 10,9,9,4 + vadduwm 5,5,10 + vsel 29,4,3,2 + vadduwm 4,4,28 + vadduwm 5,5,29 + .long 0x13C2FE82 + vadduwm 5,5,30 + vxor 29,6,7 + vsel 29,7,0,29 + vadduwm 1,1,5 + .long 0x13C68682 + vadduwm 30,30,29 + vadduwm 5,5,30 + lvx 28,28,7 + .long 0x7D802699 + addi 4,4,16 + vsldoi 11,10,10,4 + vadduwm 4,4,11 + vsel 29,3,2,1 + vadduwm 3,3,28 + vadduwm 4,4,29 
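+# The bare .long words throughout this file are hand-encoded POWER8
+# instructions: the values ending in ...82 are vshasigmaw forms and
+# the 0x7C....19/0x7D....99 values are VSX loads/stores, emitted
+# numerically so the file also assembles with toolchains that
+# predate these mnemonics.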
+ .long 0x13C1FE82 + vadduwm 4,4,30 + vxor 29,5,6 + vsel 29,6,7,29 + vadduwm 0,0,4 + .long 0x13C58682 + vadduwm 30,30,29 + vadduwm 4,4,30 + lvx 28,29,7 + vadduwm 3,3,12 + vsel 29,2,1,0 + vadduwm 2,2,28 + vadduwm 3,3,29 + .long 0x13C0FE82 + vadduwm 3,3,30 + vxor 29,4,5 + vsel 29,5,6,29 + vadduwm 7,7,3 + .long 0x13C48682 + vadduwm 30,30,29 + vadduwm 3,3,30 + lvx 28,30,7 + vsldoi 13,12,12,4 + vadduwm 2,2,13 + vsel 29,1,0,7 + vadduwm 1,1,28 + vadduwm 2,2,29 + .long 0x13C7FE82 + vadduwm 2,2,30 + vxor 29,3,4 + vsel 29,4,5,29 + vadduwm 6,6,2 + .long 0x13C38682 + vadduwm 30,30,29 + vadduwm 2,2,30 + lvx 28,31,7 + addi 7,7,0x80 + vsldoi 14,13,13,4 + vadduwm 1,1,14 + vsel 29,0,7,6 + vadduwm 0,0,28 + vadduwm 1,1,29 + .long 0x13C6FE82 + vadduwm 1,1,30 + vxor 29,2,3 + vsel 29,3,4,29 + vadduwm 5,5,1 + .long 0x13C28682 + vadduwm 30,30,29 + vadduwm 1,1,30 + lvx 28,0,7 + .long 0x7E002699 + addi 4,4,16 + vsldoi 15,14,14,4 + vadduwm 0,0,15 + vsel 29,7,6,5 + vadduwm 7,7,28 + vadduwm 0,0,29 + .long 0x13C5FE82 + vadduwm 0,0,30 + vxor 29,1,2 + vsel 29,2,3,29 + vadduwm 4,4,0 + .long 0x13C18682 + vadduwm 30,30,29 + vadduwm 0,0,30 + lvx 28,10,7 + vadduwm 7,7,16 + vsel 29,6,5,4 + vadduwm 6,6,28 + vadduwm 7,7,29 + .long 0x13C4FE82 + vadduwm 7,7,30 + vxor 29,0,1 + vsel 29,1,2,29 + vadduwm 3,3,7 + .long 0x13C08682 + vadduwm 30,30,29 + vadduwm 7,7,30 + lvx 28,26,7 + vsldoi 17,16,16,4 + vadduwm 6,6,17 + vsel 29,5,4,3 + vadduwm 5,5,28 + vadduwm 6,6,29 + .long 0x13C3FE82 + vadduwm 6,6,30 + vxor 29,7,0 + vsel 29,0,1,29 + vadduwm 2,2,6 + .long 0x13C78682 + vadduwm 30,30,29 + vadduwm 6,6,30 + lvx 28,27,7 + vsldoi 18,17,17,4 + vadduwm 5,5,18 + vsel 29,4,3,2 + vadduwm 4,4,28 + vadduwm 5,5,29 + .long 0x13C2FE82 + vadduwm 5,5,30 + vxor 29,6,7 + vsel 29,7,0,29 + vadduwm 1,1,5 + .long 0x13C68682 + vadduwm 30,30,29 + vadduwm 5,5,30 + lvx 28,28,7 + .long 0x7F002699 + addi 4,4,16 + vsldoi 19,18,18,4 + vadduwm 4,4,19 + vsel 29,3,2,1 + vadduwm 3,3,28 + vadduwm 4,4,29 + .long 0x13C1FE82 + vadduwm 4,4,30 + vxor 29,5,6 + vsel 29,6,7,29 + vadduwm 0,0,4 + .long 0x13C58682 + vadduwm 30,30,29 + vadduwm 4,4,30 + lvx 28,29,7 + vadduwm 3,3,24 + vsel 29,2,1,0 + vadduwm 2,2,28 + vadduwm 3,3,29 + .long 0x13C0FE82 + vadduwm 3,3,30 + vxor 29,4,5 + vsel 29,5,6,29 + vadduwm 7,7,3 + .long 0x13C48682 + vadduwm 30,30,29 + vadduwm 3,3,30 + lvx 28,30,7 + vsldoi 25,24,24,4 + vadduwm 2,2,25 + vsel 29,1,0,7 + vadduwm 1,1,28 + vadduwm 2,2,29 + .long 0x13C7FE82 + vadduwm 2,2,30 + vxor 29,3,4 + vsel 29,4,5,29 + vadduwm 6,6,2 + .long 0x13C38682 + vadduwm 30,30,29 + vadduwm 2,2,30 + lvx 28,31,7 + addi 7,7,0x80 + vsldoi 26,25,25,4 + vadduwm 1,1,26 + vsel 29,0,7,6 + vadduwm 0,0,28 + vadduwm 1,1,29 + .long 0x13C6FE82 + vadduwm 1,1,30 + vxor 29,2,3 + vsel 29,3,4,29 + vadduwm 5,5,1 + .long 0x13C28682 + vadduwm 30,30,29 + vadduwm 1,1,30 + lvx 28,0,7 + vsldoi 27,26,26,4 + .long 0x13C90682 + vadduwm 8,8,30 + .long 0x13DA7E82 + vadduwm 8,8,30 + vadduwm 8,8,17 + vadduwm 0,0,27 + vsel 29,7,6,5 + vadduwm 7,7,28 + vadduwm 0,0,29 + .long 0x13C5FE82 + vadduwm 0,0,30 + vxor 29,1,2 + vsel 29,2,3,29 + vadduwm 4,4,0 + .long 0x13C18682 + vadduwm 30,30,29 + vadduwm 0,0,30 + lvx 28,10,7 + mtctr 0 + b .L16_xx +.align 5 +.L16_xx: + .long 0x13CA0682 + vadduwm 9,9,30 + .long 0x13DB7E82 + vadduwm 9,9,30 + vadduwm 9,9,18 + vadduwm 7,7,8 + vsel 29,6,5,4 + vadduwm 6,6,28 + vadduwm 7,7,29 + .long 0x13C4FE82 + vadduwm 7,7,30 + vxor 29,0,1 + vsel 29,1,2,29 + vadduwm 3,3,7 + .long 0x13C08682 + vadduwm 30,30,29 + vadduwm 7,7,30 + lvx 28,26,7 + .long 0x13CB0682 + vadduwm 10,10,30 + .long 0x13C87E82 + 
vadduwm 10,10,30 + vadduwm 10,10,19 + vadduwm 6,6,9 + vsel 29,5,4,3 + vadduwm 5,5,28 + vadduwm 6,6,29 + .long 0x13C3FE82 + vadduwm 6,6,30 + vxor 29,7,0 + vsel 29,0,1,29 + vadduwm 2,2,6 + .long 0x13C78682 + vadduwm 30,30,29 + vadduwm 6,6,30 + lvx 28,27,7 + .long 0x13CC0682 + vadduwm 11,11,30 + .long 0x13C97E82 + vadduwm 11,11,30 + vadduwm 11,11,24 + vadduwm 5,5,10 + vsel 29,4,3,2 + vadduwm 4,4,28 + vadduwm 5,5,29 + .long 0x13C2FE82 + vadduwm 5,5,30 + vxor 29,6,7 + vsel 29,7,0,29 + vadduwm 1,1,5 + .long 0x13C68682 + vadduwm 30,30,29 + vadduwm 5,5,30 + lvx 28,28,7 + .long 0x13CD0682 + vadduwm 12,12,30 + .long 0x13CA7E82 + vadduwm 12,12,30 + vadduwm 12,12,25 + vadduwm 4,4,11 + vsel 29,3,2,1 + vadduwm 3,3,28 + vadduwm 4,4,29 + .long 0x13C1FE82 + vadduwm 4,4,30 + vxor 29,5,6 + vsel 29,6,7,29 + vadduwm 0,0,4 + .long 0x13C58682 + vadduwm 30,30,29 + vadduwm 4,4,30 + lvx 28,29,7 + .long 0x13CE0682 + vadduwm 13,13,30 + .long 0x13CB7E82 + vadduwm 13,13,30 + vadduwm 13,13,26 + vadduwm 3,3,12 + vsel 29,2,1,0 + vadduwm 2,2,28 + vadduwm 3,3,29 + .long 0x13C0FE82 + vadduwm 3,3,30 + vxor 29,4,5 + vsel 29,5,6,29 + vadduwm 7,7,3 + .long 0x13C48682 + vadduwm 30,30,29 + vadduwm 3,3,30 + lvx 28,30,7 + .long 0x13CF0682 + vadduwm 14,14,30 + .long 0x13CC7E82 + vadduwm 14,14,30 + vadduwm 14,14,27 + vadduwm 2,2,13 + vsel 29,1,0,7 + vadduwm 1,1,28 + vadduwm 2,2,29 + .long 0x13C7FE82 + vadduwm 2,2,30 + vxor 29,3,4 + vsel 29,4,5,29 + vadduwm 6,6,2 + .long 0x13C38682 + vadduwm 30,30,29 + vadduwm 2,2,30 + lvx 28,31,7 + addi 7,7,0x80 + .long 0x13D00682 + vadduwm 15,15,30 + .long 0x13CD7E82 + vadduwm 15,15,30 + vadduwm 15,15,8 + vadduwm 1,1,14 + vsel 29,0,7,6 + vadduwm 0,0,28 + vadduwm 1,1,29 + .long 0x13C6FE82 + vadduwm 1,1,30 + vxor 29,2,3 + vsel 29,3,4,29 + vadduwm 5,5,1 + .long 0x13C28682 + vadduwm 30,30,29 + vadduwm 1,1,30 + lvx 28,0,7 + .long 0x13D10682 + vadduwm 16,16,30 + .long 0x13CE7E82 + vadduwm 16,16,30 + vadduwm 16,16,9 + vadduwm 0,0,15 + vsel 29,7,6,5 + vadduwm 7,7,28 + vadduwm 0,0,29 + .long 0x13C5FE82 + vadduwm 0,0,30 + vxor 29,1,2 + vsel 29,2,3,29 + vadduwm 4,4,0 + .long 0x13C18682 + vadduwm 30,30,29 + vadduwm 0,0,30 + lvx 28,10,7 + .long 0x13D20682 + vadduwm 17,17,30 + .long 0x13CF7E82 + vadduwm 17,17,30 + vadduwm 17,17,10 + vadduwm 7,7,16 + vsel 29,6,5,4 + vadduwm 6,6,28 + vadduwm 7,7,29 + .long 0x13C4FE82 + vadduwm 7,7,30 + vxor 29,0,1 + vsel 29,1,2,29 + vadduwm 3,3,7 + .long 0x13C08682 + vadduwm 30,30,29 + vadduwm 7,7,30 + lvx 28,26,7 + .long 0x13D30682 + vadduwm 18,18,30 + .long 0x13D07E82 + vadduwm 18,18,30 + vadduwm 18,18,11 + vadduwm 6,6,17 + vsel 29,5,4,3 + vadduwm 5,5,28 + vadduwm 6,6,29 + .long 0x13C3FE82 + vadduwm 6,6,30 + vxor 29,7,0 + vsel 29,0,1,29 + vadduwm 2,2,6 + .long 0x13C78682 + vadduwm 30,30,29 + vadduwm 6,6,30 + lvx 28,27,7 + .long 0x13D80682 + vadduwm 19,19,30 + .long 0x13D17E82 + vadduwm 19,19,30 + vadduwm 19,19,12 + vadduwm 5,5,18 + vsel 29,4,3,2 + vadduwm 4,4,28 + vadduwm 5,5,29 + .long 0x13C2FE82 + vadduwm 5,5,30 + vxor 29,6,7 + vsel 29,7,0,29 + vadduwm 1,1,5 + .long 0x13C68682 + vadduwm 30,30,29 + vadduwm 5,5,30 + lvx 28,28,7 + .long 0x13D90682 + vadduwm 24,24,30 + .long 0x13D27E82 + vadduwm 24,24,30 + vadduwm 24,24,13 + vadduwm 4,4,19 + vsel 29,3,2,1 + vadduwm 3,3,28 + vadduwm 4,4,29 + .long 0x13C1FE82 + vadduwm 4,4,30 + vxor 29,5,6 + vsel 29,6,7,29 + vadduwm 0,0,4 + .long 0x13C58682 + vadduwm 30,30,29 + vadduwm 4,4,30 + lvx 28,29,7 + .long 0x13DA0682 + vadduwm 25,25,30 + .long 0x13D37E82 + vadduwm 25,25,30 + vadduwm 25,25,14 + vadduwm 3,3,24 + vsel 29,2,1,0 + vadduwm 2,2,28 + 
vadduwm 3,3,29 + .long 0x13C0FE82 + vadduwm 3,3,30 + vxor 29,4,5 + vsel 29,5,6,29 + vadduwm 7,7,3 + .long 0x13C48682 + vadduwm 30,30,29 + vadduwm 3,3,30 + lvx 28,30,7 + .long 0x13DB0682 + vadduwm 26,26,30 + .long 0x13D87E82 + vadduwm 26,26,30 + vadduwm 26,26,15 + vadduwm 2,2,25 + vsel 29,1,0,7 + vadduwm 1,1,28 + vadduwm 2,2,29 + .long 0x13C7FE82 + vadduwm 2,2,30 + vxor 29,3,4 + vsel 29,4,5,29 + vadduwm 6,6,2 + .long 0x13C38682 + vadduwm 30,30,29 + vadduwm 2,2,30 + lvx 28,31,7 + addi 7,7,0x80 + .long 0x13C80682 + vadduwm 27,27,30 + .long 0x13D97E82 + vadduwm 27,27,30 + vadduwm 27,27,16 + vadduwm 1,1,26 + vsel 29,0,7,6 + vadduwm 0,0,28 + vadduwm 1,1,29 + .long 0x13C6FE82 + vadduwm 1,1,30 + vxor 29,2,3 + vsel 29,3,4,29 + vadduwm 5,5,1 + .long 0x13C28682 + vadduwm 30,30,29 + vadduwm 1,1,30 + lvx 28,0,7 + .long 0x13C90682 + vadduwm 8,8,30 + .long 0x13DA7E82 + vadduwm 8,8,30 + vadduwm 8,8,17 + vadduwm 0,0,27 + vsel 29,7,6,5 + vadduwm 7,7,28 + vadduwm 0,0,29 + .long 0x13C5FE82 + vadduwm 0,0,30 + vxor 29,1,2 + vsel 29,2,3,29 + vadduwm 4,4,0 + .long 0x13C18682 + vadduwm 30,30,29 + vadduwm 0,0,30 + lvx 28,10,7 + bdnz .L16_xx + + lvx 10,0,11 + subic. 5,5,1 + lvx 11,10,11 + vadduwm 0,0,10 + lvx 12,26,11 + vadduwm 1,1,11 + lvx 13,27,11 + vadduwm 2,2,12 + lvx 14,28,11 + vadduwm 3,3,13 + lvx 15,29,11 + vadduwm 4,4,14 + lvx 16,30,11 + vadduwm 5,5,15 + lvx 17,31,11 + vadduwm 6,6,16 + vadduwm 7,7,17 + bne .Loop + lvx 8,26,7 + vperm 0,0,1,28 + lvx 9,27,7 + vperm 4,4,5,28 + vperm 0,0,2,8 + vperm 4,4,6,8 + vperm 0,0,3,9 + vperm 4,4,7,9 + .long 0x7C001F19 + .long 0x7C8A1F19 + addi 11,1,207 + mtlr 8 + mtspr 256,12 + lvx 24,0,11 + lvx 25,10,11 + lvx 26,26,11 + lvx 27,27,11 + lvx 28,28,11 + lvx 29,29,11 + lvx 30,30,11 + lvx 31,31,11 + ld 26,336(1) + ld 27,344(1) + ld 28,352(1) + ld 29,360(1) + ld 30,368(1) + ld 31,376(1) + addi 1,1,384 + blr +.long 0 +.byte 0,12,4,1,0x80,6,3,0 +.long 0 +.size .zfs_sha256_power8,.-.zfs_sha256_power8 +.size zfs_sha256_power8,.-.zfs_sha256_power8 +.align 6 +.LPICmeup: + mflr 0 + bcl 20,31,$+4 + mflr 6 + addi 6,6,56 + mtlr 0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.space 28 +.long 0x428a2f98,0x428a2f98,0x428a2f98,0x428a2f98 +.long 0x71374491,0x71374491,0x71374491,0x71374491 +.long 0xb5c0fbcf,0xb5c0fbcf,0xb5c0fbcf,0xb5c0fbcf +.long 0xe9b5dba5,0xe9b5dba5,0xe9b5dba5,0xe9b5dba5 +.long 0x3956c25b,0x3956c25b,0x3956c25b,0x3956c25b +.long 0x59f111f1,0x59f111f1,0x59f111f1,0x59f111f1 +.long 0x923f82a4,0x923f82a4,0x923f82a4,0x923f82a4 +.long 0xab1c5ed5,0xab1c5ed5,0xab1c5ed5,0xab1c5ed5 +.long 0xd807aa98,0xd807aa98,0xd807aa98,0xd807aa98 +.long 0x12835b01,0x12835b01,0x12835b01,0x12835b01 +.long 0x243185be,0x243185be,0x243185be,0x243185be +.long 0x550c7dc3,0x550c7dc3,0x550c7dc3,0x550c7dc3 +.long 0x72be5d74,0x72be5d74,0x72be5d74,0x72be5d74 +.long 0x80deb1fe,0x80deb1fe,0x80deb1fe,0x80deb1fe +.long 0x9bdc06a7,0x9bdc06a7,0x9bdc06a7,0x9bdc06a7 +.long 0xc19bf174,0xc19bf174,0xc19bf174,0xc19bf174 +.long 0xe49b69c1,0xe49b69c1,0xe49b69c1,0xe49b69c1 +.long 0xefbe4786,0xefbe4786,0xefbe4786,0xefbe4786 +.long 0x0fc19dc6,0x0fc19dc6,0x0fc19dc6,0x0fc19dc6 +.long 0x240ca1cc,0x240ca1cc,0x240ca1cc,0x240ca1cc +.long 0x2de92c6f,0x2de92c6f,0x2de92c6f,0x2de92c6f +.long 0x4a7484aa,0x4a7484aa,0x4a7484aa,0x4a7484aa +.long 0x5cb0a9dc,0x5cb0a9dc,0x5cb0a9dc,0x5cb0a9dc +.long 0x76f988da,0x76f988da,0x76f988da,0x76f988da +.long 0x983e5152,0x983e5152,0x983e5152,0x983e5152 +.long 0xa831c66d,0xa831c66d,0xa831c66d,0xa831c66d +.long 0xb00327c8,0xb00327c8,0xb00327c8,0xb00327c8 +.long 0xbf597fc7,0xbf597fc7,0xbf597fc7,0xbf597fc7 +.long 
0xc6e00bf3,0xc6e00bf3,0xc6e00bf3,0xc6e00bf3 +.long 0xd5a79147,0xd5a79147,0xd5a79147,0xd5a79147 +.long 0x06ca6351,0x06ca6351,0x06ca6351,0x06ca6351 +.long 0x14292967,0x14292967,0x14292967,0x14292967 +.long 0x27b70a85,0x27b70a85,0x27b70a85,0x27b70a85 +.long 0x2e1b2138,0x2e1b2138,0x2e1b2138,0x2e1b2138 +.long 0x4d2c6dfc,0x4d2c6dfc,0x4d2c6dfc,0x4d2c6dfc +.long 0x53380d13,0x53380d13,0x53380d13,0x53380d13 +.long 0x650a7354,0x650a7354,0x650a7354,0x650a7354 +.long 0x766a0abb,0x766a0abb,0x766a0abb,0x766a0abb +.long 0x81c2c92e,0x81c2c92e,0x81c2c92e,0x81c2c92e +.long 0x92722c85,0x92722c85,0x92722c85,0x92722c85 +.long 0xa2bfe8a1,0xa2bfe8a1,0xa2bfe8a1,0xa2bfe8a1 +.long 0xa81a664b,0xa81a664b,0xa81a664b,0xa81a664b +.long 0xc24b8b70,0xc24b8b70,0xc24b8b70,0xc24b8b70 +.long 0xc76c51a3,0xc76c51a3,0xc76c51a3,0xc76c51a3 +.long 0xd192e819,0xd192e819,0xd192e819,0xd192e819 +.long 0xd6990624,0xd6990624,0xd6990624,0xd6990624 +.long 0xf40e3585,0xf40e3585,0xf40e3585,0xf40e3585 +.long 0x106aa070,0x106aa070,0x106aa070,0x106aa070 +.long 0x19a4c116,0x19a4c116,0x19a4c116,0x19a4c116 +.long 0x1e376c08,0x1e376c08,0x1e376c08,0x1e376c08 +.long 0x2748774c,0x2748774c,0x2748774c,0x2748774c +.long 0x34b0bcb5,0x34b0bcb5,0x34b0bcb5,0x34b0bcb5 +.long 0x391c0cb3,0x391c0cb3,0x391c0cb3,0x391c0cb3 +.long 0x4ed8aa4a,0x4ed8aa4a,0x4ed8aa4a,0x4ed8aa4a +.long 0x5b9cca4f,0x5b9cca4f,0x5b9cca4f,0x5b9cca4f +.long 0x682e6ff3,0x682e6ff3,0x682e6ff3,0x682e6ff3 +.long 0x748f82ee,0x748f82ee,0x748f82ee,0x748f82ee +.long 0x78a5636f,0x78a5636f,0x78a5636f,0x78a5636f +.long 0x84c87814,0x84c87814,0x84c87814,0x84c87814 +.long 0x8cc70208,0x8cc70208,0x8cc70208,0x8cc70208 +.long 0x90befffa,0x90befffa,0x90befffa,0x90befffa +.long 0xa4506ceb,0xa4506ceb,0xa4506ceb,0xa4506ceb +.long 0xbef9a3f7,0xbef9a3f7,0xbef9a3f7,0xbef9a3f7 +.long 0xc67178f2,0xc67178f2,0xc67178f2,0xc67178f2 +.long 0,0,0,0 +.long 0x00010203,0x10111213,0x10111213,0x10111213 +.long 0x00010203,0x04050607,0x10111213,0x10111213 +.long 0x00010203,0x04050607,0x08090a0b,0x10111213 +.byte 83,72,65,50,53,54,32,102,111,114,32,80,111,119,101,114,73,83,65,32,50,46,48,55,44,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 + +#elif (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + +.abiversion 2 +.text + +.globl zfs_sha256_power8 +.type zfs_sha256_power8,@function +.align 6 +zfs_sha256_power8: +.localentry zfs_sha256_power8,0 + + stdu 1,-384(1) + mflr 8 + li 10,207 + li 11,223 + stvx 24,10,1 + addi 10,10,32 + li 12,-1 + stvx 25,11,1 + addi 11,11,32 + stvx 26,10,1 + addi 10,10,32 + stvx 27,11,1 + addi 11,11,32 + stvx 28,10,1 + addi 10,10,32 + stvx 29,11,1 + addi 11,11,32 + stvx 30,10,1 + stvx 31,11,1 + li 11,-4096+255 + stw 12,332(1) + li 10,0x10 + std 26,336(1) + li 26,0x20 + std 27,344(1) + li 27,0x30 + std 28,352(1) + li 28,0x40 + std 29,360(1) + li 29,0x50 + std 30,368(1) + li 30,0x60 + std 31,376(1) + li 31,0x70 + std 8,400(1) + or 11,11,11 + + bl .LPICmeup + addi 11,1,79 + li 7,8 + lvsl 31,0,7 + vspltisb 28,0x0f + vxor 31,31,28 + .long 0x7C001E19 + .long 0x7C8A1E19 + vsldoi 1,0,0,4 + vsldoi 2,0,0,8 + vsldoi 3,0,0,12 + vsldoi 5,4,4,4 + vsldoi 6,4,4,8 + vsldoi 7,4,4,12 + li 0,3 + b .Loop +.align 5 +.Loop: + lvx 28,0,6 + .long 0x7D002699 + addi 4,4,16 + mr 7,6 + stvx 0,0,11 + stvx 1,10,11 + stvx 2,26,11 + stvx 3,27,11 + stvx 4,28,11 + stvx 5,29,11 + stvx 6,30,11 + stvx 7,31,11 + vadduwm 7,7,28 + lvx 28,10,6 + vperm 8,8,8,31 + vadduwm 7,7,8 + vsel 29,6,5,4 + vadduwm 6,6,28 + vadduwm 7,7,29 + .long 0x13C4FE82 + vadduwm 7,7,30 
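+# On little-endian, each 16-byte input load is permuted back to
+# big-endian word order with vperm and the mask built above from
+# lvsl/vspltisb, since SHA-256 consumes big-endian message words.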
+ vxor 29,0,1 + vsel 29,1,2,29 + vadduwm 3,3,7 + .long 0x13C08682 + vadduwm 30,30,29 + vadduwm 7,7,30 + lvx 28,26,7 + vsldoi 9,8,8,4 + vadduwm 6,6,9 + vsel 29,5,4,3 + vadduwm 5,5,28 + vadduwm 6,6,29 + .long 0x13C3FE82 + vadduwm 6,6,30 + vxor 29,7,0 + vsel 29,0,1,29 + vadduwm 2,2,6 + .long 0x13C78682 + vadduwm 30,30,29 + vadduwm 6,6,30 + lvx 28,27,7 + vsldoi 10,9,9,4 + vadduwm 5,5,10 + vsel 29,4,3,2 + vadduwm 4,4,28 + vadduwm 5,5,29 + .long 0x13C2FE82 + vadduwm 5,5,30 + vxor 29,6,7 + vsel 29,7,0,29 + vadduwm 1,1,5 + .long 0x13C68682 + vadduwm 30,30,29 + vadduwm 5,5,30 + lvx 28,28,7 + .long 0x7D802699 + addi 4,4,16 + vsldoi 11,10,10,4 + vadduwm 4,4,11 + vsel 29,3,2,1 + vadduwm 3,3,28 + vadduwm 4,4,29 + .long 0x13C1FE82 + vadduwm 4,4,30 + vxor 29,5,6 + vsel 29,6,7,29 + vadduwm 0,0,4 + .long 0x13C58682 + vadduwm 30,30,29 + vadduwm 4,4,30 + lvx 28,29,7 + vperm 12,12,12,31 + vadduwm 3,3,12 + vsel 29,2,1,0 + vadduwm 2,2,28 + vadduwm 3,3,29 + .long 0x13C0FE82 + vadduwm 3,3,30 + vxor 29,4,5 + vsel 29,5,6,29 + vadduwm 7,7,3 + .long 0x13C48682 + vadduwm 30,30,29 + vadduwm 3,3,30 + lvx 28,30,7 + vsldoi 13,12,12,4 + vadduwm 2,2,13 + vsel 29,1,0,7 + vadduwm 1,1,28 + vadduwm 2,2,29 + .long 0x13C7FE82 + vadduwm 2,2,30 + vxor 29,3,4 + vsel 29,4,5,29 + vadduwm 6,6,2 + .long 0x13C38682 + vadduwm 30,30,29 + vadduwm 2,2,30 + lvx 28,31,7 + addi 7,7,0x80 + vsldoi 14,13,13,4 + vadduwm 1,1,14 + vsel 29,0,7,6 + vadduwm 0,0,28 + vadduwm 1,1,29 + .long 0x13C6FE82 + vadduwm 1,1,30 + vxor 29,2,3 + vsel 29,3,4,29 + vadduwm 5,5,1 + .long 0x13C28682 + vadduwm 30,30,29 + vadduwm 1,1,30 + lvx 28,0,7 + .long 0x7E002699 + addi 4,4,16 + vsldoi 15,14,14,4 + vadduwm 0,0,15 + vsel 29,7,6,5 + vadduwm 7,7,28 + vadduwm 0,0,29 + .long 0x13C5FE82 + vadduwm 0,0,30 + vxor 29,1,2 + vsel 29,2,3,29 + vadduwm 4,4,0 + .long 0x13C18682 + vadduwm 30,30,29 + vadduwm 0,0,30 + lvx 28,10,7 + vperm 16,16,16,31 + vadduwm 7,7,16 + vsel 29,6,5,4 + vadduwm 6,6,28 + vadduwm 7,7,29 + .long 0x13C4FE82 + vadduwm 7,7,30 + vxor 29,0,1 + vsel 29,1,2,29 + vadduwm 3,3,7 + .long 0x13C08682 + vadduwm 30,30,29 + vadduwm 7,7,30 + lvx 28,26,7 + vsldoi 17,16,16,4 + vadduwm 6,6,17 + vsel 29,5,4,3 + vadduwm 5,5,28 + vadduwm 6,6,29 + .long 0x13C3FE82 + vadduwm 6,6,30 + vxor 29,7,0 + vsel 29,0,1,29 + vadduwm 2,2,6 + .long 0x13C78682 + vadduwm 30,30,29 + vadduwm 6,6,30 + lvx 28,27,7 + vsldoi 18,17,17,4 + vadduwm 5,5,18 + vsel 29,4,3,2 + vadduwm 4,4,28 + vadduwm 5,5,29 + .long 0x13C2FE82 + vadduwm 5,5,30 + vxor 29,6,7 + vsel 29,7,0,29 + vadduwm 1,1,5 + .long 0x13C68682 + vadduwm 30,30,29 + vadduwm 5,5,30 + lvx 28,28,7 + .long 0x7F002699 + addi 4,4,16 + vsldoi 19,18,18,4 + vadduwm 4,4,19 + vsel 29,3,2,1 + vadduwm 3,3,28 + vadduwm 4,4,29 + .long 0x13C1FE82 + vadduwm 4,4,30 + vxor 29,5,6 + vsel 29,6,7,29 + vadduwm 0,0,4 + .long 0x13C58682 + vadduwm 30,30,29 + vadduwm 4,4,30 + lvx 28,29,7 + vperm 24,24,24,31 + vadduwm 3,3,24 + vsel 29,2,1,0 + vadduwm 2,2,28 + vadduwm 3,3,29 + .long 0x13C0FE82 + vadduwm 3,3,30 + vxor 29,4,5 + vsel 29,5,6,29 + vadduwm 7,7,3 + .long 0x13C48682 + vadduwm 30,30,29 + vadduwm 3,3,30 + lvx 28,30,7 + vsldoi 25,24,24,4 + vadduwm 2,2,25 + vsel 29,1,0,7 + vadduwm 1,1,28 + vadduwm 2,2,29 + .long 0x13C7FE82 + vadduwm 2,2,30 + vxor 29,3,4 + vsel 29,4,5,29 + vadduwm 6,6,2 + .long 0x13C38682 + vadduwm 30,30,29 + vadduwm 2,2,30 + lvx 28,31,7 + addi 7,7,0x80 + vsldoi 26,25,25,4 + vadduwm 1,1,26 + vsel 29,0,7,6 + vadduwm 0,0,28 + vadduwm 1,1,29 + .long 0x13C6FE82 + vadduwm 1,1,30 + vxor 29,2,3 + vsel 29,3,4,29 + vadduwm 5,5,1 + .long 0x13C28682 + vadduwm 30,30,29 
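+# The working variables a..h live in v0-v7; vsel evaluates the Ch
+# and Maj functions and vshasigmaw the Sigma functions, with four
+# message schedule words produced per 16-byte load.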
+ vadduwm 1,1,30 + lvx 28,0,7 + vsldoi 27,26,26,4 + .long 0x13C90682 + vadduwm 8,8,30 + .long 0x13DA7E82 + vadduwm 8,8,30 + vadduwm 8,8,17 + vadduwm 0,0,27 + vsel 29,7,6,5 + vadduwm 7,7,28 + vadduwm 0,0,29 + .long 0x13C5FE82 + vadduwm 0,0,30 + vxor 29,1,2 + vsel 29,2,3,29 + vadduwm 4,4,0 + .long 0x13C18682 + vadduwm 30,30,29 + vadduwm 0,0,30 + lvx 28,10,7 + mtctr 0 + b .L16_xx +.align 5 +.L16_xx: + .long 0x13CA0682 + vadduwm 9,9,30 + .long 0x13DB7E82 + vadduwm 9,9,30 + vadduwm 9,9,18 + vadduwm 7,7,8 + vsel 29,6,5,4 + vadduwm 6,6,28 + vadduwm 7,7,29 + .long 0x13C4FE82 + vadduwm 7,7,30 + vxor 29,0,1 + vsel 29,1,2,29 + vadduwm 3,3,7 + .long 0x13C08682 + vadduwm 30,30,29 + vadduwm 7,7,30 + lvx 28,26,7 + .long 0x13CB0682 + vadduwm 10,10,30 + .long 0x13C87E82 + vadduwm 10,10,30 + vadduwm 10,10,19 + vadduwm 6,6,9 + vsel 29,5,4,3 + vadduwm 5,5,28 + vadduwm 6,6,29 + .long 0x13C3FE82 + vadduwm 6,6,30 + vxor 29,7,0 + vsel 29,0,1,29 + vadduwm 2,2,6 + .long 0x13C78682 + vadduwm 30,30,29 + vadduwm 6,6,30 + lvx 28,27,7 + .long 0x13CC0682 + vadduwm 11,11,30 + .long 0x13C97E82 + vadduwm 11,11,30 + vadduwm 11,11,24 + vadduwm 5,5,10 + vsel 29,4,3,2 + vadduwm 4,4,28 + vadduwm 5,5,29 + .long 0x13C2FE82 + vadduwm 5,5,30 + vxor 29,6,7 + vsel 29,7,0,29 + vadduwm 1,1,5 + .long 0x13C68682 + vadduwm 30,30,29 + vadduwm 5,5,30 + lvx 28,28,7 + .long 0x13CD0682 + vadduwm 12,12,30 + .long 0x13CA7E82 + vadduwm 12,12,30 + vadduwm 12,12,25 + vadduwm 4,4,11 + vsel 29,3,2,1 + vadduwm 3,3,28 + vadduwm 4,4,29 + .long 0x13C1FE82 + vadduwm 4,4,30 + vxor 29,5,6 + vsel 29,6,7,29 + vadduwm 0,0,4 + .long 0x13C58682 + vadduwm 30,30,29 + vadduwm 4,4,30 + lvx 28,29,7 + .long 0x13CE0682 + vadduwm 13,13,30 + .long 0x13CB7E82 + vadduwm 13,13,30 + vadduwm 13,13,26 + vadduwm 3,3,12 + vsel 29,2,1,0 + vadduwm 2,2,28 + vadduwm 3,3,29 + .long 0x13C0FE82 + vadduwm 3,3,30 + vxor 29,4,5 + vsel 29,5,6,29 + vadduwm 7,7,3 + .long 0x13C48682 + vadduwm 30,30,29 + vadduwm 3,3,30 + lvx 28,30,7 + .long 0x13CF0682 + vadduwm 14,14,30 + .long 0x13CC7E82 + vadduwm 14,14,30 + vadduwm 14,14,27 + vadduwm 2,2,13 + vsel 29,1,0,7 + vadduwm 1,1,28 + vadduwm 2,2,29 + .long 0x13C7FE82 + vadduwm 2,2,30 + vxor 29,3,4 + vsel 29,4,5,29 + vadduwm 6,6,2 + .long 0x13C38682 + vadduwm 30,30,29 + vadduwm 2,2,30 + lvx 28,31,7 + addi 7,7,0x80 + .long 0x13D00682 + vadduwm 15,15,30 + .long 0x13CD7E82 + vadduwm 15,15,30 + vadduwm 15,15,8 + vadduwm 1,1,14 + vsel 29,0,7,6 + vadduwm 0,0,28 + vadduwm 1,1,29 + .long 0x13C6FE82 + vadduwm 1,1,30 + vxor 29,2,3 + vsel 29,3,4,29 + vadduwm 5,5,1 + .long 0x13C28682 + vadduwm 30,30,29 + vadduwm 1,1,30 + lvx 28,0,7 + .long 0x13D10682 + vadduwm 16,16,30 + .long 0x13CE7E82 + vadduwm 16,16,30 + vadduwm 16,16,9 + vadduwm 0,0,15 + vsel 29,7,6,5 + vadduwm 7,7,28 + vadduwm 0,0,29 + .long 0x13C5FE82 + vadduwm 0,0,30 + vxor 29,1,2 + vsel 29,2,3,29 + vadduwm 4,4,0 + .long 0x13C18682 + vadduwm 30,30,29 + vadduwm 0,0,30 + lvx 28,10,7 + .long 0x13D20682 + vadduwm 17,17,30 + .long 0x13CF7E82 + vadduwm 17,17,30 + vadduwm 17,17,10 + vadduwm 7,7,16 + vsel 29,6,5,4 + vadduwm 6,6,28 + vadduwm 7,7,29 + .long 0x13C4FE82 + vadduwm 7,7,30 + vxor 29,0,1 + vsel 29,1,2,29 + vadduwm 3,3,7 + .long 0x13C08682 + vadduwm 30,30,29 + vadduwm 7,7,30 + lvx 28,26,7 + .long 0x13D30682 + vadduwm 18,18,30 + .long 0x13D07E82 + vadduwm 18,18,30 + vadduwm 18,18,11 + vadduwm 6,6,17 + vsel 29,5,4,3 + vadduwm 5,5,28 + vadduwm 6,6,29 + .long 0x13C3FE82 + vadduwm 6,6,30 + vxor 29,7,0 + vsel 29,0,1,29 + vadduwm 2,2,6 + .long 0x13C78682 + vadduwm 30,30,29 + vadduwm 6,6,30 + lvx 28,27,7 + .long 
0x13D80682 + vadduwm 19,19,30 + .long 0x13D17E82 + vadduwm 19,19,30 + vadduwm 19,19,12 + vadduwm 5,5,18 + vsel 29,4,3,2 + vadduwm 4,4,28 + vadduwm 5,5,29 + .long 0x13C2FE82 + vadduwm 5,5,30 + vxor 29,6,7 + vsel 29,7,0,29 + vadduwm 1,1,5 + .long 0x13C68682 + vadduwm 30,30,29 + vadduwm 5,5,30 + lvx 28,28,7 + .long 0x13D90682 + vadduwm 24,24,30 + .long 0x13D27E82 + vadduwm 24,24,30 + vadduwm 24,24,13 + vadduwm 4,4,19 + vsel 29,3,2,1 + vadduwm 3,3,28 + vadduwm 4,4,29 + .long 0x13C1FE82 + vadduwm 4,4,30 + vxor 29,5,6 + vsel 29,6,7,29 + vadduwm 0,0,4 + .long 0x13C58682 + vadduwm 30,30,29 + vadduwm 4,4,30 + lvx 28,29,7 + .long 0x13DA0682 + vadduwm 25,25,30 + .long 0x13D37E82 + vadduwm 25,25,30 + vadduwm 25,25,14 + vadduwm 3,3,24 + vsel 29,2,1,0 + vadduwm 2,2,28 + vadduwm 3,3,29 + .long 0x13C0FE82 + vadduwm 3,3,30 + vxor 29,4,5 + vsel 29,5,6,29 + vadduwm 7,7,3 + .long 0x13C48682 + vadduwm 30,30,29 + vadduwm 3,3,30 + lvx 28,30,7 + .long 0x13DB0682 + vadduwm 26,26,30 + .long 0x13D87E82 + vadduwm 26,26,30 + vadduwm 26,26,15 + vadduwm 2,2,25 + vsel 29,1,0,7 + vadduwm 1,1,28 + vadduwm 2,2,29 + .long 0x13C7FE82 + vadduwm 2,2,30 + vxor 29,3,4 + vsel 29,4,5,29 + vadduwm 6,6,2 + .long 0x13C38682 + vadduwm 30,30,29 + vadduwm 2,2,30 + lvx 28,31,7 + addi 7,7,0x80 + .long 0x13C80682 + vadduwm 27,27,30 + .long 0x13D97E82 + vadduwm 27,27,30 + vadduwm 27,27,16 + vadduwm 1,1,26 + vsel 29,0,7,6 + vadduwm 0,0,28 + vadduwm 1,1,29 + .long 0x13C6FE82 + vadduwm 1,1,30 + vxor 29,2,3 + vsel 29,3,4,29 + vadduwm 5,5,1 + .long 0x13C28682 + vadduwm 30,30,29 + vadduwm 1,1,30 + lvx 28,0,7 + .long 0x13C90682 + vadduwm 8,8,30 + .long 0x13DA7E82 + vadduwm 8,8,30 + vadduwm 8,8,17 + vadduwm 0,0,27 + vsel 29,7,6,5 + vadduwm 7,7,28 + vadduwm 0,0,29 + .long 0x13C5FE82 + vadduwm 0,0,30 + vxor 29,1,2 + vsel 29,2,3,29 + vadduwm 4,4,0 + .long 0x13C18682 + vadduwm 30,30,29 + vadduwm 0,0,30 + lvx 28,10,7 + bdnz .L16_xx + + lvx 10,0,11 + subic. 
5,5,1 + lvx 11,10,11 + vadduwm 0,0,10 + lvx 12,26,11 + vadduwm 1,1,11 + lvx 13,27,11 + vadduwm 2,2,12 + lvx 14,28,11 + vadduwm 3,3,13 + lvx 15,29,11 + vadduwm 4,4,14 + lvx 16,30,11 + vadduwm 5,5,15 + lvx 17,31,11 + vadduwm 6,6,16 + vadduwm 7,7,17 + bne .Loop + lvx 8,26,7 + vperm 0,0,1,28 + lvx 9,27,7 + vperm 4,4,5,28 + vperm 0,0,2,8 + vperm 4,4,6,8 + vperm 0,0,3,9 + vperm 4,4,7,9 + .long 0x7C001F19 + .long 0x7C8A1F19 + addi 11,1,207 + mtlr 8 + or 12,12,12 + lvx 24,0,11 + lvx 25,10,11 + lvx 26,26,11 + lvx 27,27,11 + lvx 28,28,11 + lvx 29,29,11 + lvx 30,30,11 + lvx 31,31,11 + ld 26,336(1) + ld 27,344(1) + ld 28,352(1) + ld 29,360(1) + ld 30,368(1) + ld 31,376(1) + addi 1,1,384 + blr +.long 0 +.byte 0,12,4,1,0x80,6,3,0 +.long 0 +.size zfs_sha256_power8,.-zfs_sha256_power8 +.align 6 +.LPICmeup: + mflr 0 + bcl 20,31,$+4 + mflr 6 + addi 6,6,56 + mtlr 0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.space 28 +.long 0x428a2f98,0x428a2f98,0x428a2f98,0x428a2f98 +.long 0x71374491,0x71374491,0x71374491,0x71374491 +.long 0xb5c0fbcf,0xb5c0fbcf,0xb5c0fbcf,0xb5c0fbcf +.long 0xe9b5dba5,0xe9b5dba5,0xe9b5dba5,0xe9b5dba5 +.long 0x3956c25b,0x3956c25b,0x3956c25b,0x3956c25b +.long 0x59f111f1,0x59f111f1,0x59f111f1,0x59f111f1 +.long 0x923f82a4,0x923f82a4,0x923f82a4,0x923f82a4 +.long 0xab1c5ed5,0xab1c5ed5,0xab1c5ed5,0xab1c5ed5 +.long 0xd807aa98,0xd807aa98,0xd807aa98,0xd807aa98 +.long 0x12835b01,0x12835b01,0x12835b01,0x12835b01 +.long 0x243185be,0x243185be,0x243185be,0x243185be +.long 0x550c7dc3,0x550c7dc3,0x550c7dc3,0x550c7dc3 +.long 0x72be5d74,0x72be5d74,0x72be5d74,0x72be5d74 +.long 0x80deb1fe,0x80deb1fe,0x80deb1fe,0x80deb1fe +.long 0x9bdc06a7,0x9bdc06a7,0x9bdc06a7,0x9bdc06a7 +.long 0xc19bf174,0xc19bf174,0xc19bf174,0xc19bf174 +.long 0xe49b69c1,0xe49b69c1,0xe49b69c1,0xe49b69c1 +.long 0xefbe4786,0xefbe4786,0xefbe4786,0xefbe4786 +.long 0x0fc19dc6,0x0fc19dc6,0x0fc19dc6,0x0fc19dc6 +.long 0x240ca1cc,0x240ca1cc,0x240ca1cc,0x240ca1cc +.long 0x2de92c6f,0x2de92c6f,0x2de92c6f,0x2de92c6f +.long 0x4a7484aa,0x4a7484aa,0x4a7484aa,0x4a7484aa +.long 0x5cb0a9dc,0x5cb0a9dc,0x5cb0a9dc,0x5cb0a9dc +.long 0x76f988da,0x76f988da,0x76f988da,0x76f988da +.long 0x983e5152,0x983e5152,0x983e5152,0x983e5152 +.long 0xa831c66d,0xa831c66d,0xa831c66d,0xa831c66d +.long 0xb00327c8,0xb00327c8,0xb00327c8,0xb00327c8 +.long 0xbf597fc7,0xbf597fc7,0xbf597fc7,0xbf597fc7 +.long 0xc6e00bf3,0xc6e00bf3,0xc6e00bf3,0xc6e00bf3 +.long 0xd5a79147,0xd5a79147,0xd5a79147,0xd5a79147 +.long 0x06ca6351,0x06ca6351,0x06ca6351,0x06ca6351 +.long 0x14292967,0x14292967,0x14292967,0x14292967 +.long 0x27b70a85,0x27b70a85,0x27b70a85,0x27b70a85 +.long 0x2e1b2138,0x2e1b2138,0x2e1b2138,0x2e1b2138 +.long 0x4d2c6dfc,0x4d2c6dfc,0x4d2c6dfc,0x4d2c6dfc +.long 0x53380d13,0x53380d13,0x53380d13,0x53380d13 +.long 0x650a7354,0x650a7354,0x650a7354,0x650a7354 +.long 0x766a0abb,0x766a0abb,0x766a0abb,0x766a0abb +.long 0x81c2c92e,0x81c2c92e,0x81c2c92e,0x81c2c92e +.long 0x92722c85,0x92722c85,0x92722c85,0x92722c85 +.long 0xa2bfe8a1,0xa2bfe8a1,0xa2bfe8a1,0xa2bfe8a1 +.long 0xa81a664b,0xa81a664b,0xa81a664b,0xa81a664b +.long 0xc24b8b70,0xc24b8b70,0xc24b8b70,0xc24b8b70 +.long 0xc76c51a3,0xc76c51a3,0xc76c51a3,0xc76c51a3 +.long 0xd192e819,0xd192e819,0xd192e819,0xd192e819 +.long 0xd6990624,0xd6990624,0xd6990624,0xd6990624 +.long 0xf40e3585,0xf40e3585,0xf40e3585,0xf40e3585 +.long 0x106aa070,0x106aa070,0x106aa070,0x106aa070 +.long 0x19a4c116,0x19a4c116,0x19a4c116,0x19a4c116 +.long 0x1e376c08,0x1e376c08,0x1e376c08,0x1e376c08 +.long 0x2748774c,0x2748774c,0x2748774c,0x2748774c +.long 
0x34b0bcb5,0x34b0bcb5,0x34b0bcb5,0x34b0bcb5 +.long 0x391c0cb3,0x391c0cb3,0x391c0cb3,0x391c0cb3 +.long 0x4ed8aa4a,0x4ed8aa4a,0x4ed8aa4a,0x4ed8aa4a +.long 0x5b9cca4f,0x5b9cca4f,0x5b9cca4f,0x5b9cca4f +.long 0x682e6ff3,0x682e6ff3,0x682e6ff3,0x682e6ff3 +.long 0x748f82ee,0x748f82ee,0x748f82ee,0x748f82ee +.long 0x78a5636f,0x78a5636f,0x78a5636f,0x78a5636f +.long 0x84c87814,0x84c87814,0x84c87814,0x84c87814 +.long 0x8cc70208,0x8cc70208,0x8cc70208,0x8cc70208 +.long 0x90befffa,0x90befffa,0x90befffa,0x90befffa +.long 0xa4506ceb,0xa4506ceb,0xa4506ceb,0xa4506ceb +.long 0xbef9a3f7,0xbef9a3f7,0xbef9a3f7,0xbef9a3f7 +.long 0xc67178f2,0xc67178f2,0xc67178f2,0xc67178f2 +.long 0,0,0,0 +.long 0x10111213,0x10111213,0x10111213,0x00010203 +.long 0x10111213,0x10111213,0x04050607,0x00010203 +.long 0x10111213,0x08090a0b,0x04050607,0x00010203 +.byte 83,72,65,50,53,54,32,102,111,114,32,80,111,119,101,114,73,83,65,32,50,46,48,55,44,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif diff --git a/module/icp/asm-ppc64/sha2/sha256-ppc.S b/module/icp/asm-ppc64/sha2/sha256-ppc.S new file mode 100644 index 000000000000..257edce0785f --- /dev/null +++ b/module/icp/asm-ppc64/sha2/sha256-ppc.S @@ -0,0 +1,2705 @@ +/* + * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#if (defined(__PPC64__) && defined(__BIG_ENDIAN__)) + +.text + +.globl zfs_sha256_ppc +.type zfs_sha256_ppc,@function +.section ".opd","aw" +.align 3 +zfs_sha256_ppc: +.quad .zfs_sha256_ppc,.TOC.@tocbase,0 +.previous +.align 6 +.zfs_sha256_ppc: + stdu 1,-320(1) + mflr 0 + sldi 5,5,6 + + std 3,144(1) + + std 14,176(1) + std 15,184(1) + std 16,192(1) + std 17,200(1) + std 18,208(1) + std 19,216(1) + std 20,224(1) + std 21,232(1) + std 22,240(1) + std 23,248(1) + std 24,256(1) + std 25,264(1) + std 26,272(1) + std 27,280(1) + std 28,288(1) + std 29,296(1) + std 30,304(1) + std 31,312(1) + std 0,336(1) + lwz 8,0(3) + mr 31,4 + lwz 9,4(3) + lwz 10,8(3) + lwz 11,12(3) + lwz 12,16(3) + lwz 6,20(3) + lwz 14,24(3) + lwz 15,28(3) + bl .LPICmeup +.LPICedup: + andi. 0,31,3 + bne .Lunaligned +.Laligned: + add 5,31,5 + std 5,128(1) + std 31,136(1) + bl .Lsha2_block_private + b .Ldone + + + + + + + +.align 4 +.Lunaligned: + subfic 0,31,4096 + andi. 0,0,4032 + beq .Lcross_page + cmpld 5,0 + ble .Laligned + subfc 5,0,5 + add 0,31,0 + std 5,120(1) + std 0,128(1) + std 31,136(1) + bl .Lsha2_block_private + + ld 5,120(1) +.Lcross_page: + li 0,16 + mtctr 0 + addi 20,1,48 +.Lmemcpy: + lbz 16,0(31) + lbz 17,1(31) + lbz 18,2(31) + lbz 19,3(31) + addi 31,31,4 + stb 16,0(20) + stb 17,1(20) + stb 18,2(20) + stb 19,3(20) + addi 20,20,4 + bdnz .Lmemcpy + std 31,112(1) + addi 0,1,112 + addi 31,1,48 + std 5,120(1) + std 0,128(1) + std 31,136(1) + bl .Lsha2_block_private + ld 31,112(1) + ld 5,120(1) + addic. 
5,5,-64 + bne .Lunaligned + +.Ldone: + ld 0,336(1) + ld 14,176(1) + ld 15,184(1) + ld 16,192(1) + ld 17,200(1) + ld 18,208(1) + ld 19,216(1) + ld 20,224(1) + ld 21,232(1) + ld 22,240(1) + ld 23,248(1) + ld 24,256(1) + ld 25,264(1) + ld 26,272(1) + ld 27,280(1) + ld 28,288(1) + ld 29,296(1) + ld 30,304(1) + ld 31,312(1) + mtlr 0 + addi 1,1,320 + blr +.long 0 +.byte 0,12,4,1,0x80,18,3,0 +.long 0 +.align 4 +.Lsha2_block_private: + lwz 0,0(7) + lwz 16,0(31) + rotrwi 3,12,6 + rotrwi 4,12,11 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrwi 4,4,14 + or 5,5,0 + add 15,15,16 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrwi 3,8,2 + rotrwi 4,8,13 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + lwz 0,4(7) + add 15,15,3 + add 15,15,5 + + lwz 17,4(31) + rotrwi 3,11,6 + rotrwi 4,11,11 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrwi 4,4,14 + or 5,5,0 + add 14,14,17 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrwi 3,15,2 + rotrwi 4,15,13 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + lwz 0,8(7) + add 14,14,3 + add 14,14,5 + + lwz 18,8(31) + rotrwi 3,10,6 + rotrwi 4,10,11 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrwi 4,4,14 + or 5,5,0 + add 6,6,18 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrwi 3,14,2 + rotrwi 4,14,13 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + lwz 0,12(7) + add 6,6,3 + add 6,6,5 + + lwz 19,12(31) + rotrwi 3,9,6 + rotrwi 4,9,11 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrwi 4,4,14 + or 5,5,0 + add 12,12,19 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrwi 3,6,2 + rotrwi 4,6,13 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + lwz 0,16(7) + add 12,12,3 + add 12,12,5 + + lwz 20,16(31) + rotrwi 3,8,6 + rotrwi 4,8,11 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrwi 4,4,14 + or 5,5,0 + add 11,11,20 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrwi 3,12,2 + rotrwi 4,12,13 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + lwz 0,20(7) + add 11,11,3 + add 11,11,5 + + lwz 21,20(31) + rotrwi 3,15,6 + rotrwi 4,15,11 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrwi 4,4,14 + or 5,5,0 + add 10,10,21 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrwi 3,11,2 + rotrwi 4,11,13 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + lwz 0,24(7) + add 10,10,3 + add 10,10,5 + + lwz 22,24(31) + rotrwi 3,14,6 + rotrwi 4,14,11 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrwi 4,4,14 + or 5,5,0 + add 9,9,22 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrwi 3,10,2 + rotrwi 4,10,13 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + lwz 0,28(7) + add 9,9,3 + add 9,9,5 + + lwz 23,28(31) + rotrwi 3,6,6 + rotrwi 4,6,11 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrwi 4,4,14 + or 5,5,0 + add 8,8,23 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrwi 3,9,2 + rotrwi 4,9,13 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + lwz 0,32(7) + add 8,8,3 + add 8,8,5 + + lwz 24,32(31) + rotrwi 3,12,6 + rotrwi 4,12,11 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrwi 4,4,14 + or 
5,5,0 + add 15,15,24 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrwi 3,8,2 + rotrwi 4,8,13 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + lwz 0,36(7) + add 15,15,3 + add 15,15,5 + + lwz 25,36(31) + rotrwi 3,11,6 + rotrwi 4,11,11 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrwi 4,4,14 + or 5,5,0 + add 14,14,25 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrwi 3,15,2 + rotrwi 4,15,13 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + lwz 0,40(7) + add 14,14,3 + add 14,14,5 + + lwz 26,40(31) + rotrwi 3,10,6 + rotrwi 4,10,11 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrwi 4,4,14 + or 5,5,0 + add 6,6,26 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrwi 3,14,2 + rotrwi 4,14,13 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + lwz 0,44(7) + add 6,6,3 + add 6,6,5 + + lwz 27,44(31) + rotrwi 3,9,6 + rotrwi 4,9,11 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrwi 4,4,14 + or 5,5,0 + add 12,12,27 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrwi 3,6,2 + rotrwi 4,6,13 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + lwz 0,48(7) + add 12,12,3 + add 12,12,5 + + lwz 28,48(31) + rotrwi 3,8,6 + rotrwi 4,8,11 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrwi 4,4,14 + or 5,5,0 + add 11,11,28 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrwi 3,12,2 + rotrwi 4,12,13 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + lwz 0,52(7) + add 11,11,3 + add 11,11,5 + + lwz 29,52(31) + rotrwi 3,15,6 + rotrwi 4,15,11 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrwi 4,4,14 + or 5,5,0 + add 10,10,29 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrwi 3,11,2 + rotrwi 4,11,13 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + lwz 0,56(7) + add 10,10,3 + add 10,10,5 + + lwz 30,56(31) + rotrwi 3,14,6 + rotrwi 4,14,11 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrwi 4,4,14 + or 5,5,0 + add 9,9,30 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrwi 3,10,2 + rotrwi 4,10,13 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + lwz 0,60(7) + add 9,9,3 + add 9,9,5 + + lwz 31,60(31) + rotrwi 3,6,6 + rotrwi 4,6,11 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrwi 4,4,14 + or 5,5,0 + add 8,8,31 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrwi 3,9,2 + rotrwi 4,9,13 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + add 8,8,3 + add 8,8,5 + + li 5,3 + mtctr 5 +.align 4 +.Lrounds: + addi 7,7,64 + rotrwi 3,17,7 + rotrwi 4,17,18 + rotrwi 5,30,17 + rotrwi 0,30,19 + xor 3,3,4 + srwi 4,17,3 + xor 5,5,0 + srwi 0,30,10 + add 16,16,25 + xor 3,3,4 + xor 5,5,0 + lwz 0,0(7) + add 16,16,3 + add 16,16,5 + rotrwi 3,12,6 + rotrwi 4,12,11 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrwi 4,4,14 + or 5,5,0 + add 15,15,16 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrwi 3,8,2 + rotrwi 4,8,13 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + add 15,15,3 + add 15,15,5 + + rotrwi 3,18,7 + rotrwi 4,18,18 + rotrwi 5,31,17 + rotrwi 0,31,19 + xor 3,3,4 + srwi 4,18,3 + 
xor 5,5,0 + srwi 0,31,10 + add 17,17,26 + xor 3,3,4 + xor 5,5,0 + lwz 0,4(7) + add 17,17,3 + add 17,17,5 + rotrwi 3,11,6 + rotrwi 4,11,11 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrwi 4,4,14 + or 5,5,0 + add 14,14,17 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrwi 3,15,2 + rotrwi 4,15,13 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + add 14,14,3 + add 14,14,5 + + rotrwi 3,19,7 + rotrwi 4,19,18 + rotrwi 5,16,17 + rotrwi 0,16,19 + xor 3,3,4 + srwi 4,19,3 + xor 5,5,0 + srwi 0,16,10 + add 18,18,27 + xor 3,3,4 + xor 5,5,0 + lwz 0,8(7) + add 18,18,3 + add 18,18,5 + rotrwi 3,10,6 + rotrwi 4,10,11 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrwi 4,4,14 + or 5,5,0 + add 6,6,18 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrwi 3,14,2 + rotrwi 4,14,13 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + add 6,6,3 + add 6,6,5 + + rotrwi 3,20,7 + rotrwi 4,20,18 + rotrwi 5,17,17 + rotrwi 0,17,19 + xor 3,3,4 + srwi 4,20,3 + xor 5,5,0 + srwi 0,17,10 + add 19,19,28 + xor 3,3,4 + xor 5,5,0 + lwz 0,12(7) + add 19,19,3 + add 19,19,5 + rotrwi 3,9,6 + rotrwi 4,9,11 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrwi 4,4,14 + or 5,5,0 + add 12,12,19 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrwi 3,6,2 + rotrwi 4,6,13 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + add 12,12,3 + add 12,12,5 + + rotrwi 3,21,7 + rotrwi 4,21,18 + rotrwi 5,18,17 + rotrwi 0,18,19 + xor 3,3,4 + srwi 4,21,3 + xor 5,5,0 + srwi 0,18,10 + add 20,20,29 + xor 3,3,4 + xor 5,5,0 + lwz 0,16(7) + add 20,20,3 + add 20,20,5 + rotrwi 3,8,6 + rotrwi 4,8,11 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrwi 4,4,14 + or 5,5,0 + add 11,11,20 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrwi 3,12,2 + rotrwi 4,12,13 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + add 11,11,3 + add 11,11,5 + + rotrwi 3,22,7 + rotrwi 4,22,18 + rotrwi 5,19,17 + rotrwi 0,19,19 + xor 3,3,4 + srwi 4,22,3 + xor 5,5,0 + srwi 0,19,10 + add 21,21,30 + xor 3,3,4 + xor 5,5,0 + lwz 0,20(7) + add 21,21,3 + add 21,21,5 + rotrwi 3,15,6 + rotrwi 4,15,11 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrwi 4,4,14 + or 5,5,0 + add 10,10,21 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrwi 3,11,2 + rotrwi 4,11,13 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + add 10,10,3 + add 10,10,5 + + rotrwi 3,23,7 + rotrwi 4,23,18 + rotrwi 5,20,17 + rotrwi 0,20,19 + xor 3,3,4 + srwi 4,23,3 + xor 5,5,0 + srwi 0,20,10 + add 22,22,31 + xor 3,3,4 + xor 5,5,0 + lwz 0,24(7) + add 22,22,3 + add 22,22,5 + rotrwi 3,14,6 + rotrwi 4,14,11 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrwi 4,4,14 + or 5,5,0 + add 9,9,22 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrwi 3,10,2 + rotrwi 4,10,13 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + add 9,9,3 + add 9,9,5 + + rotrwi 3,24,7 + rotrwi 4,24,18 + rotrwi 5,21,17 + rotrwi 0,21,19 + xor 3,3,4 + srwi 4,24,3 + xor 5,5,0 + srwi 0,21,10 + add 23,23,16 + xor 3,3,4 + xor 5,5,0 + lwz 0,28(7) + add 23,23,3 + add 23,23,5 + rotrwi 3,6,6 + rotrwi 4,6,11 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrwi 4,4,14 + or 5,5,0 + add 8,8,23 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrwi 
3,9,2 + rotrwi 4,9,13 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + add 8,8,3 + add 8,8,5 + + rotrwi 3,25,7 + rotrwi 4,25,18 + rotrwi 5,22,17 + rotrwi 0,22,19 + xor 3,3,4 + srwi 4,25,3 + xor 5,5,0 + srwi 0,22,10 + add 24,24,17 + xor 3,3,4 + xor 5,5,0 + lwz 0,32(7) + add 24,24,3 + add 24,24,5 + rotrwi 3,12,6 + rotrwi 4,12,11 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrwi 4,4,14 + or 5,5,0 + add 15,15,24 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrwi 3,8,2 + rotrwi 4,8,13 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + add 15,15,3 + add 15,15,5 + + rotrwi 3,26,7 + rotrwi 4,26,18 + rotrwi 5,23,17 + rotrwi 0,23,19 + xor 3,3,4 + srwi 4,26,3 + xor 5,5,0 + srwi 0,23,10 + add 25,25,18 + xor 3,3,4 + xor 5,5,0 + lwz 0,36(7) + add 25,25,3 + add 25,25,5 + rotrwi 3,11,6 + rotrwi 4,11,11 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrwi 4,4,14 + or 5,5,0 + add 14,14,25 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrwi 3,15,2 + rotrwi 4,15,13 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + add 14,14,3 + add 14,14,5 + + rotrwi 3,27,7 + rotrwi 4,27,18 + rotrwi 5,24,17 + rotrwi 0,24,19 + xor 3,3,4 + srwi 4,27,3 + xor 5,5,0 + srwi 0,24,10 + add 26,26,19 + xor 3,3,4 + xor 5,5,0 + lwz 0,40(7) + add 26,26,3 + add 26,26,5 + rotrwi 3,10,6 + rotrwi 4,10,11 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrwi 4,4,14 + or 5,5,0 + add 6,6,26 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrwi 3,14,2 + rotrwi 4,14,13 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + add 6,6,3 + add 6,6,5 + + rotrwi 3,28,7 + rotrwi 4,28,18 + rotrwi 5,25,17 + rotrwi 0,25,19 + xor 3,3,4 + srwi 4,28,3 + xor 5,5,0 + srwi 0,25,10 + add 27,27,20 + xor 3,3,4 + xor 5,5,0 + lwz 0,44(7) + add 27,27,3 + add 27,27,5 + rotrwi 3,9,6 + rotrwi 4,9,11 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrwi 4,4,14 + or 5,5,0 + add 12,12,27 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrwi 3,6,2 + rotrwi 4,6,13 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + add 12,12,3 + add 12,12,5 + + rotrwi 3,29,7 + rotrwi 4,29,18 + rotrwi 5,26,17 + rotrwi 0,26,19 + xor 3,3,4 + srwi 4,29,3 + xor 5,5,0 + srwi 0,26,10 + add 28,28,21 + xor 3,3,4 + xor 5,5,0 + lwz 0,48(7) + add 28,28,3 + add 28,28,5 + rotrwi 3,8,6 + rotrwi 4,8,11 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrwi 4,4,14 + or 5,5,0 + add 11,11,28 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrwi 3,12,2 + rotrwi 4,12,13 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + add 11,11,3 + add 11,11,5 + + rotrwi 3,30,7 + rotrwi 4,30,18 + rotrwi 5,27,17 + rotrwi 0,27,19 + xor 3,3,4 + srwi 4,30,3 + xor 5,5,0 + srwi 0,27,10 + add 29,29,22 + xor 3,3,4 + xor 5,5,0 + lwz 0,52(7) + add 29,29,3 + add 29,29,5 + rotrwi 3,15,6 + rotrwi 4,15,11 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrwi 4,4,14 + or 5,5,0 + add 10,10,29 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrwi 3,11,2 + rotrwi 4,11,13 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + add 10,10,3 + add 10,10,5 + + rotrwi 3,31,7 + rotrwi 4,31,18 + rotrwi 5,28,17 + rotrwi 0,28,19 + xor 3,3,4 + srwi 4,31,3 + xor 
5,5,0 + srwi 0,28,10 + add 30,30,23 + xor 3,3,4 + xor 5,5,0 + lwz 0,56(7) + add 30,30,3 + add 30,30,5 + rotrwi 3,14,6 + rotrwi 4,14,11 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrwi 4,4,14 + or 5,5,0 + add 9,9,30 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrwi 3,10,2 + rotrwi 4,10,13 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + add 9,9,3 + add 9,9,5 + + rotrwi 3,16,7 + rotrwi 4,16,18 + rotrwi 5,29,17 + rotrwi 0,29,19 + xor 3,3,4 + srwi 4,16,3 + xor 5,5,0 + srwi 0,29,10 + add 31,31,24 + xor 3,3,4 + xor 5,5,0 + lwz 0,60(7) + add 31,31,3 + add 31,31,5 + rotrwi 3,6,6 + rotrwi 4,6,11 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrwi 4,4,14 + or 5,5,0 + add 8,8,31 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrwi 3,9,2 + rotrwi 4,9,13 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + add 8,8,3 + add 8,8,5 + + bdnz .Lrounds + + ld 3,144(1) + ld 31,136(1) + ld 5,128(1) + subi 7,7,192 + + lwz 16,0(3) + lwz 17,4(3) + lwz 18,8(3) + lwz 19,12(3) + lwz 20,16(3) + lwz 21,20(3) + lwz 22,24(3) + addi 31,31,64 + lwz 23,28(3) + add 8,8,16 + add 9,9,17 + std 31,136(1) + add 10,10,18 + stw 8,0(3) + add 11,11,19 + stw 9,4(3) + add 12,12,20 + stw 10,8(3) + add 6,6,21 + stw 11,12(3) + add 14,14,22 + stw 12,16(3) + add 15,15,23 + stw 6,20(3) + stw 14,24(3) + cmpld 31,5 + stw 15,28(3) + bne .Lsha2_block_private + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.size .zfs_sha256_ppc,.-.zfs_sha256_ppc +.size zfs_sha256_ppc,.-.zfs_sha256_ppc +.align 6 +.LPICmeup: + mflr 0 + bcl 20,31,$+4 + mflr 7 + addi 7,7,56 + mtlr 0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.space 28 +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +#elif (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + +.abiversion 2 +.text + +.globl zfs_sha256_ppc +.type zfs_sha256_ppc,@function +.align 6 +zfs_sha256_ppc: +.localentry zfs_sha256_ppc,0 + + stdu 1,-320(1) + mflr 0 + sldi 5,5,6 + + std 3,144(1) + + std 14,176(1) + std 15,184(1) + std 16,192(1) + std 17,200(1) + std 18,208(1) + std 19,216(1) + std 20,224(1) + std 21,232(1) + std 22,240(1) + std 23,248(1) + std 24,256(1) + std 25,264(1) + std 26,272(1) + std 27,280(1) + std 28,288(1) + std 29,296(1) + std 30,304(1) + std 31,312(1) + std 0,336(1) + lwz 8,0(3) + mr 31,4 + lwz 9,4(3) + lwz 10,8(3) + lwz 11,12(3) + lwz 12,16(3) + lwz 6,20(3) + lwz 14,24(3) + lwz 15,28(3) + bl .LPICmeup +.LPICedup: + andi. 0,31,3 + bne .Lunaligned +.Laligned: + add 5,31,5 + std 5,128(1) + std 31,136(1) + bl .Lsha2_block_private + b .Ldone + +.align 4 +.Lunaligned: + subfic 0,31,4096 + andi. 
0,0,4032 + beq .Lcross_page + cmpld 5,0 + ble .Laligned + subfc 5,0,5 + add 0,31,0 + std 5,120(1) + std 0,128(1) + std 31,136(1) + bl .Lsha2_block_private + + ld 5,120(1) +.Lcross_page: + li 0,16 + mtctr 0 + addi 20,1,48 +.Lmemcpy: + lbz 16,0(31) + lbz 17,1(31) + lbz 18,2(31) + lbz 19,3(31) + addi 31,31,4 + stb 16,0(20) + stb 17,1(20) + stb 18,2(20) + stb 19,3(20) + addi 20,20,4 + bdnz .Lmemcpy + std 31,112(1) + addi 0,1,112 + addi 31,1,48 + std 5,120(1) + std 0,128(1) + std 31,136(1) + bl .Lsha2_block_private + ld 31,112(1) + ld 5,120(1) + addic. 5,5,-64 + bne .Lunaligned + +.Ldone: + ld 0,336(1) + ld 14,176(1) + ld 15,184(1) + ld 16,192(1) + ld 17,200(1) + ld 18,208(1) + ld 19,216(1) + ld 20,224(1) + ld 21,232(1) + ld 22,240(1) + ld 23,248(1) + ld 24,256(1) + ld 25,264(1) + ld 26,272(1) + ld 27,280(1) + ld 28,288(1) + ld 29,296(1) + ld 30,304(1) + ld 31,312(1) + mtlr 0 + addi 1,1,320 + blr +.long 0 +.byte 0,12,4,1,0x80,18,3,0 +.long 0 +.align 4 +.Lsha2_block_private: + lwz 0,0(7) + lwz 3,0(31) + rotlwi 16,3,8 + rlwimi 16,3,24,0,7 + rlwimi 16,3,24,16,23 + rotrwi 3,12,6 + rotrwi 4,12,11 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrwi 4,4,14 + or 5,5,0 + add 15,15,16 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrwi 3,8,2 + rotrwi 4,8,13 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + lwz 0,4(7) + add 15,15,3 + add 15,15,5 + + lwz 3,4(31) + rotlwi 17,3,8 + rlwimi 17,3,24,0,7 + rlwimi 17,3,24,16,23 + rotrwi 3,11,6 + rotrwi 4,11,11 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrwi 4,4,14 + or 5,5,0 + add 14,14,17 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrwi 3,15,2 + rotrwi 4,15,13 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + lwz 0,8(7) + add 14,14,3 + add 14,14,5 + + lwz 3,8(31) + rotlwi 18,3,8 + rlwimi 18,3,24,0,7 + rlwimi 18,3,24,16,23 + rotrwi 3,10,6 + rotrwi 4,10,11 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrwi 4,4,14 + or 5,5,0 + add 6,6,18 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrwi 3,14,2 + rotrwi 4,14,13 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + lwz 0,12(7) + add 6,6,3 + add 6,6,5 + + lwz 3,12(31) + rotlwi 19,3,8 + rlwimi 19,3,24,0,7 + rlwimi 19,3,24,16,23 + rotrwi 3,9,6 + rotrwi 4,9,11 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrwi 4,4,14 + or 5,5,0 + add 12,12,19 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrwi 3,6,2 + rotrwi 4,6,13 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + lwz 0,16(7) + add 12,12,3 + add 12,12,5 + + lwz 3,16(31) + rotlwi 20,3,8 + rlwimi 20,3,24,0,7 + rlwimi 20,3,24,16,23 + rotrwi 3,8,6 + rotrwi 4,8,11 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrwi 4,4,14 + or 5,5,0 + add 11,11,20 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrwi 3,12,2 + rotrwi 4,12,13 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + lwz 0,20(7) + add 11,11,3 + add 11,11,5 + + lwz 3,20(31) + rotlwi 21,3,8 + rlwimi 21,3,24,0,7 + rlwimi 21,3,24,16,23 + rotrwi 3,15,6 + rotrwi 4,15,11 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrwi 4,4,14 + or 5,5,0 + add 10,10,21 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrwi 3,11,2 + rotrwi 4,11,13 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + 
xor 5,5,0 + lwz 0,24(7) + add 10,10,3 + add 10,10,5 + + lwz 3,24(31) + rotlwi 22,3,8 + rlwimi 22,3,24,0,7 + rlwimi 22,3,24,16,23 + rotrwi 3,14,6 + rotrwi 4,14,11 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrwi 4,4,14 + or 5,5,0 + add 9,9,22 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrwi 3,10,2 + rotrwi 4,10,13 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + lwz 0,28(7) + add 9,9,3 + add 9,9,5 + + lwz 3,28(31) + rotlwi 23,3,8 + rlwimi 23,3,24,0,7 + rlwimi 23,3,24,16,23 + rotrwi 3,6,6 + rotrwi 4,6,11 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrwi 4,4,14 + or 5,5,0 + add 8,8,23 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrwi 3,9,2 + rotrwi 4,9,13 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + lwz 0,32(7) + add 8,8,3 + add 8,8,5 + + lwz 3,32(31) + rotlwi 24,3,8 + rlwimi 24,3,24,0,7 + rlwimi 24,3,24,16,23 + rotrwi 3,12,6 + rotrwi 4,12,11 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrwi 4,4,14 + or 5,5,0 + add 15,15,24 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrwi 3,8,2 + rotrwi 4,8,13 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + lwz 0,36(7) + add 15,15,3 + add 15,15,5 + + lwz 3,36(31) + rotlwi 25,3,8 + rlwimi 25,3,24,0,7 + rlwimi 25,3,24,16,23 + rotrwi 3,11,6 + rotrwi 4,11,11 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrwi 4,4,14 + or 5,5,0 + add 14,14,25 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrwi 3,15,2 + rotrwi 4,15,13 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + lwz 0,40(7) + add 14,14,3 + add 14,14,5 + + lwz 3,40(31) + rotlwi 26,3,8 + rlwimi 26,3,24,0,7 + rlwimi 26,3,24,16,23 + rotrwi 3,10,6 + rotrwi 4,10,11 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrwi 4,4,14 + or 5,5,0 + add 6,6,26 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrwi 3,14,2 + rotrwi 4,14,13 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + lwz 0,44(7) + add 6,6,3 + add 6,6,5 + + lwz 3,44(31) + rotlwi 27,3,8 + rlwimi 27,3,24,0,7 + rlwimi 27,3,24,16,23 + rotrwi 3,9,6 + rotrwi 4,9,11 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrwi 4,4,14 + or 5,5,0 + add 12,12,27 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrwi 3,6,2 + rotrwi 4,6,13 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + lwz 0,48(7) + add 12,12,3 + add 12,12,5 + + lwz 3,48(31) + rotlwi 28,3,8 + rlwimi 28,3,24,0,7 + rlwimi 28,3,24,16,23 + rotrwi 3,8,6 + rotrwi 4,8,11 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrwi 4,4,14 + or 5,5,0 + add 11,11,28 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrwi 3,12,2 + rotrwi 4,12,13 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + lwz 0,52(7) + add 11,11,3 + add 11,11,5 + + lwz 3,52(31) + rotlwi 29,3,8 + rlwimi 29,3,24,0,7 + rlwimi 29,3,24,16,23 + rotrwi 3,15,6 + rotrwi 4,15,11 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrwi 4,4,14 + or 5,5,0 + add 10,10,29 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrwi 3,11,2 + rotrwi 4,11,13 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + lwz 0,56(7) + add 10,10,3 + add 10,10,5 + + lwz 3,56(31) + rotlwi 30,3,8 + 
rlwimi 30,3,24,0,7 + rlwimi 30,3,24,16,23 + rotrwi 3,14,6 + rotrwi 4,14,11 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrwi 4,4,14 + or 5,5,0 + add 9,9,30 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrwi 3,10,2 + rotrwi 4,10,13 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + lwz 0,60(7) + add 9,9,3 + add 9,9,5 + + lwz 3,60(31) + rotlwi 31,3,8 + rlwimi 31,3,24,0,7 + rlwimi 31,3,24,16,23 + rotrwi 3,6,6 + rotrwi 4,6,11 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrwi 4,4,14 + or 5,5,0 + add 8,8,31 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrwi 3,9,2 + rotrwi 4,9,13 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + add 8,8,3 + add 8,8,5 + + li 5,3 + mtctr 5 +.align 4 +.Lrounds: + addi 7,7,64 + rotrwi 3,17,7 + rotrwi 4,17,18 + rotrwi 5,30,17 + rotrwi 0,30,19 + xor 3,3,4 + srwi 4,17,3 + xor 5,5,0 + srwi 0,30,10 + add 16,16,25 + xor 3,3,4 + xor 5,5,0 + lwz 0,0(7) + add 16,16,3 + add 16,16,5 + rotrwi 3,12,6 + rotrwi 4,12,11 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrwi 4,4,14 + or 5,5,0 + add 15,15,16 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrwi 3,8,2 + rotrwi 4,8,13 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + add 15,15,3 + add 15,15,5 + + rotrwi 3,18,7 + rotrwi 4,18,18 + rotrwi 5,31,17 + rotrwi 0,31,19 + xor 3,3,4 + srwi 4,18,3 + xor 5,5,0 + srwi 0,31,10 + add 17,17,26 + xor 3,3,4 + xor 5,5,0 + lwz 0,4(7) + add 17,17,3 + add 17,17,5 + rotrwi 3,11,6 + rotrwi 4,11,11 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrwi 4,4,14 + or 5,5,0 + add 14,14,17 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrwi 3,15,2 + rotrwi 4,15,13 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + add 14,14,3 + add 14,14,5 + + rotrwi 3,19,7 + rotrwi 4,19,18 + rotrwi 5,16,17 + rotrwi 0,16,19 + xor 3,3,4 + srwi 4,19,3 + xor 5,5,0 + srwi 0,16,10 + add 18,18,27 + xor 3,3,4 + xor 5,5,0 + lwz 0,8(7) + add 18,18,3 + add 18,18,5 + rotrwi 3,10,6 + rotrwi 4,10,11 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrwi 4,4,14 + or 5,5,0 + add 6,6,18 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrwi 3,14,2 + rotrwi 4,14,13 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + add 6,6,3 + add 6,6,5 + + rotrwi 3,20,7 + rotrwi 4,20,18 + rotrwi 5,17,17 + rotrwi 0,17,19 + xor 3,3,4 + srwi 4,20,3 + xor 5,5,0 + srwi 0,17,10 + add 19,19,28 + xor 3,3,4 + xor 5,5,0 + lwz 0,12(7) + add 19,19,3 + add 19,19,5 + rotrwi 3,9,6 + rotrwi 4,9,11 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrwi 4,4,14 + or 5,5,0 + add 12,12,19 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrwi 3,6,2 + rotrwi 4,6,13 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + add 12,12,3 + add 12,12,5 + + rotrwi 3,21,7 + rotrwi 4,21,18 + rotrwi 5,18,17 + rotrwi 0,18,19 + xor 3,3,4 + srwi 4,21,3 + xor 5,5,0 + srwi 0,18,10 + add 20,20,29 + xor 3,3,4 + xor 5,5,0 + lwz 0,16(7) + add 20,20,3 + add 20,20,5 + rotrwi 3,8,6 + rotrwi 4,8,11 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrwi 4,4,14 + or 5,5,0 + add 11,11,20 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrwi 3,12,2 + rotrwi 4,12,13 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 
5,5,0 + add 11,11,3 + add 11,11,5 + + rotrwi 3,22,7 + rotrwi 4,22,18 + rotrwi 5,19,17 + rotrwi 0,19,19 + xor 3,3,4 + srwi 4,22,3 + xor 5,5,0 + srwi 0,19,10 + add 21,21,30 + xor 3,3,4 + xor 5,5,0 + lwz 0,20(7) + add 21,21,3 + add 21,21,5 + rotrwi 3,15,6 + rotrwi 4,15,11 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrwi 4,4,14 + or 5,5,0 + add 10,10,21 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrwi 3,11,2 + rotrwi 4,11,13 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + add 10,10,3 + add 10,10,5 + + rotrwi 3,23,7 + rotrwi 4,23,18 + rotrwi 5,20,17 + rotrwi 0,20,19 + xor 3,3,4 + srwi 4,23,3 + xor 5,5,0 + srwi 0,20,10 + add 22,22,31 + xor 3,3,4 + xor 5,5,0 + lwz 0,24(7) + add 22,22,3 + add 22,22,5 + rotrwi 3,14,6 + rotrwi 4,14,11 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrwi 4,4,14 + or 5,5,0 + add 9,9,22 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrwi 3,10,2 + rotrwi 4,10,13 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + add 9,9,3 + add 9,9,5 + + rotrwi 3,24,7 + rotrwi 4,24,18 + rotrwi 5,21,17 + rotrwi 0,21,19 + xor 3,3,4 + srwi 4,24,3 + xor 5,5,0 + srwi 0,21,10 + add 23,23,16 + xor 3,3,4 + xor 5,5,0 + lwz 0,28(7) + add 23,23,3 + add 23,23,5 + rotrwi 3,6,6 + rotrwi 4,6,11 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrwi 4,4,14 + or 5,5,0 + add 8,8,23 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrwi 3,9,2 + rotrwi 4,9,13 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + add 8,8,3 + add 8,8,5 + + rotrwi 3,25,7 + rotrwi 4,25,18 + rotrwi 5,22,17 + rotrwi 0,22,19 + xor 3,3,4 + srwi 4,25,3 + xor 5,5,0 + srwi 0,22,10 + add 24,24,17 + xor 3,3,4 + xor 5,5,0 + lwz 0,32(7) + add 24,24,3 + add 24,24,5 + rotrwi 3,12,6 + rotrwi 4,12,11 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrwi 4,4,14 + or 5,5,0 + add 15,15,24 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrwi 3,8,2 + rotrwi 4,8,13 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + add 15,15,3 + add 15,15,5 + + rotrwi 3,26,7 + rotrwi 4,26,18 + rotrwi 5,23,17 + rotrwi 0,23,19 + xor 3,3,4 + srwi 4,26,3 + xor 5,5,0 + srwi 0,23,10 + add 25,25,18 + xor 3,3,4 + xor 5,5,0 + lwz 0,36(7) + add 25,25,3 + add 25,25,5 + rotrwi 3,11,6 + rotrwi 4,11,11 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrwi 4,4,14 + or 5,5,0 + add 14,14,25 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrwi 3,15,2 + rotrwi 4,15,13 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + add 14,14,3 + add 14,14,5 + + rotrwi 3,27,7 + rotrwi 4,27,18 + rotrwi 5,24,17 + rotrwi 0,24,19 + xor 3,3,4 + srwi 4,27,3 + xor 5,5,0 + srwi 0,24,10 + add 26,26,19 + xor 3,3,4 + xor 5,5,0 + lwz 0,40(7) + add 26,26,3 + add 26,26,5 + rotrwi 3,10,6 + rotrwi 4,10,11 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrwi 4,4,14 + or 5,5,0 + add 6,6,26 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrwi 3,14,2 + rotrwi 4,14,13 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + add 6,6,3 + add 6,6,5 + + rotrwi 3,28,7 + rotrwi 4,28,18 + rotrwi 5,25,17 + rotrwi 0,25,19 + xor 3,3,4 + srwi 4,28,3 + xor 5,5,0 + srwi 0,25,10 + add 27,27,20 + xor 3,3,4 + xor 5,5,0 + lwz 0,44(7) + add 27,27,3 + add 27,27,5 + rotrwi 3,9,6 + rotrwi 4,9,11 + and 5,10,9 + xor 
3,3,4 + add 12,12,0 + andc 0,11,9 + rotrwi 4,4,14 + or 5,5,0 + add 12,12,27 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrwi 3,6,2 + rotrwi 4,6,13 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + add 12,12,3 + add 12,12,5 + + rotrwi 3,29,7 + rotrwi 4,29,18 + rotrwi 5,26,17 + rotrwi 0,26,19 + xor 3,3,4 + srwi 4,29,3 + xor 5,5,0 + srwi 0,26,10 + add 28,28,21 + xor 3,3,4 + xor 5,5,0 + lwz 0,48(7) + add 28,28,3 + add 28,28,5 + rotrwi 3,8,6 + rotrwi 4,8,11 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrwi 4,4,14 + or 5,5,0 + add 11,11,28 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrwi 3,12,2 + rotrwi 4,12,13 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + add 11,11,3 + add 11,11,5 + + rotrwi 3,30,7 + rotrwi 4,30,18 + rotrwi 5,27,17 + rotrwi 0,27,19 + xor 3,3,4 + srwi 4,30,3 + xor 5,5,0 + srwi 0,27,10 + add 29,29,22 + xor 3,3,4 + xor 5,5,0 + lwz 0,52(7) + add 29,29,3 + add 29,29,5 + rotrwi 3,15,6 + rotrwi 4,15,11 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrwi 4,4,14 + or 5,5,0 + add 10,10,29 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrwi 3,11,2 + rotrwi 4,11,13 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + add 10,10,3 + add 10,10,5 + + rotrwi 3,31,7 + rotrwi 4,31,18 + rotrwi 5,28,17 + rotrwi 0,28,19 + xor 3,3,4 + srwi 4,31,3 + xor 5,5,0 + srwi 0,28,10 + add 30,30,23 + xor 3,3,4 + xor 5,5,0 + lwz 0,56(7) + add 30,30,3 + add 30,30,5 + rotrwi 3,14,6 + rotrwi 4,14,11 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrwi 4,4,14 + or 5,5,0 + add 9,9,30 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrwi 3,10,2 + rotrwi 4,10,13 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + add 9,9,3 + add 9,9,5 + + rotrwi 3,16,7 + rotrwi 4,16,18 + rotrwi 5,29,17 + rotrwi 0,29,19 + xor 3,3,4 + srwi 4,16,3 + xor 5,5,0 + srwi 0,29,10 + add 31,31,24 + xor 3,3,4 + xor 5,5,0 + lwz 0,60(7) + add 31,31,3 + add 31,31,5 + rotrwi 3,6,6 + rotrwi 4,6,11 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrwi 4,4,14 + or 5,5,0 + add 8,8,31 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrwi 3,9,2 + rotrwi 4,9,13 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrwi 4,4,9 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + add 8,8,3 + add 8,8,5 + + bdnz .Lrounds + + ld 3,144(1) + ld 31,136(1) + ld 5,128(1) + subi 7,7,192 + + lwz 16,0(3) + lwz 17,4(3) + lwz 18,8(3) + lwz 19,12(3) + lwz 20,16(3) + lwz 21,20(3) + lwz 22,24(3) + addi 31,31,64 + lwz 23,28(3) + add 8,8,16 + add 9,9,17 + std 31,136(1) + add 10,10,18 + stw 8,0(3) + add 11,11,19 + stw 9,4(3) + add 12,12,20 + stw 10,8(3) + add 6,6,21 + stw 11,12(3) + add 14,14,22 + stw 12,16(3) + add 15,15,23 + stw 6,20(3) + stw 14,24(3) + cmpld 31,5 + stw 15,28(3) + bne .Lsha2_block_private + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.size zfs_sha256_ppc,.-zfs_sha256_ppc +.align 6 +.LPICmeup: + mflr 0 + bcl 20,31,$+4 + mflr 7 + addi 7,7,56 + mtlr 0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.space 28 +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 
0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +#endif diff --git a/module/icp/asm-ppc64/sha2/sha512-p8.S b/module/icp/asm-ppc64/sha2/sha512-p8.S new file mode 100644 index 000000000000..abc8591a8a2e --- /dev/null +++ b/module/icp/asm-ppc64/sha2/sha512-p8.S @@ -0,0 +1,1699 @@ +/* + * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#if (defined(__PPC64__) && defined(__BIG_ENDIAN__)) + +.text + +.globl zfs_sha512_power8 +.type zfs_sha512_power8,@function +.section ".opd","aw" +.align 3 +zfs_sha512_power8: +.quad .zfs_sha512_power8,.TOC.@tocbase,0 +.previous +.align 6 +.zfs_sha512_power8: + stdu 1,-384(1) + mflr 8 + li 10,207 + li 11,223 + stvx 24,10,1 + addi 10,10,32 + mfspr 12,256 + stvx 25,11,1 + addi 11,11,32 + stvx 26,10,1 + addi 10,10,32 + stvx 27,11,1 + addi 11,11,32 + stvx 28,10,1 + addi 10,10,32 + stvx 29,11,1 + addi 11,11,32 + stvx 30,10,1 + stvx 31,11,1 + li 11,-4096+255 + stw 12,332(1) + li 10,0x10 + std 26,336(1) + li 26,0x20 + std 27,344(1) + li 27,0x30 + std 28,352(1) + li 28,0x40 + std 29,360(1) + li 29,0x50 + std 30,368(1) + li 30,0x60 + std 31,376(1) + li 31,0x70 + std 8,400(1) + mtspr 256,11 + + bl .LPICmeup + addi 11,1,79 + .long 0x7C001E99 + .long 0x7C4A1E99 + .long 0x7C9A1E99 + vsldoi 1,0,0,8 + .long 0x7CDB1E99 + vsldoi 3,2,2,8 + vsldoi 5,4,4,8 + vsldoi 7,6,6,8 + li 0,4 + b .Loop +.align 5 +.Loop: + lvx 28,0,6 + .long 0x7D002699 + addi 4,4,16 + mr 7,6 + stvx 0,0,11 + stvx 1,10,11 + stvx 2,26,11 + stvx 3,27,11 + stvx 4,28,11 + stvx 5,29,11 + stvx 6,30,11 + stvx 7,31,11 + .long 0x10E7E0C0 + lvx 28,10,6 + .long 0x10E740C0 + vsel 29,6,5,4 + .long 0x10C6E0C0 + .long 0x10E7E8C0 + .long 0x13C4FEC2 + .long 0x10E7F0C0 + vxor 29,0,1 + vsel 29,1,2,29 + .long 0x106338C0 + .long 0x13C086C2 + .long 0x13DEE8C0 + .long 0x10E7F0C0 + lvx 28,26,7 + .long 0x7D402699 + addi 4,4,16 + vsldoi 9,8,8,8 + .long 0x10C648C0 + vsel 29,5,4,3 + .long 0x10A5E0C0 + .long 0x10C6E8C0 + .long 0x13C3FEC2 + .long 0x10C6F0C0 + vxor 29,7,0 + vsel 29,0,1,29 + .long 0x104230C0 + .long 0x13C786C2 + .long 0x13DEE8C0 + .long 0x10C6F0C0 + lvx 28,27,7 + .long 0x10A550C0 + vsel 29,4,3,2 + .long 0x1084E0C0 + .long 0x10A5E8C0 + .long 0x13C2FEC2 + .long 0x10A5F0C0 + vxor 29,6,7 + vsel 29,7,0,29 + .long 0x102128C0 + .long 0x13C686C2 + .long 0x13DEE8C0 + .long 0x10A5F0C0 + lvx 28,28,7 + .long 0x7D802699 + addi 4,4,16 + vsldoi 11,10,10,8 + .long 0x108458C0 + vsel 29,3,2,1 + .long 0x1063E0C0 + .long 0x1084E8C0 + .long 0x13C1FEC2 + .long 0x1084F0C0 + vxor 29,5,6 + vsel 29,6,7,29 + .long 0x100020C0 + .long 0x13C586C2 + .long 0x13DEE8C0 + .long 0x1084F0C0 + lvx 28,29,7 + .long 0x106360C0 + vsel 29,2,1,0 + .long 0x1042E0C0 + .long 0x1063E8C0 + .long 0x13C0FEC2 + .long 0x1063F0C0 + vxor 29,4,5 + vsel 29,5,6,29 + .long 0x10E718C0 + .long 0x13C486C2 + .long 0x13DEE8C0 + .long 0x1063F0C0 + lvx 28,30,7 + .long 0x7DC02699 + addi 4,4,16 + vsldoi 
13,12,12,8 + .long 0x104268C0 + vsel 29,1,0,7 + .long 0x1021E0C0 + .long 0x1042E8C0 + .long 0x13C7FEC2 + .long 0x1042F0C0 + vxor 29,3,4 + vsel 29,4,5,29 + .long 0x10C610C0 + .long 0x13C386C2 + .long 0x13DEE8C0 + .long 0x1042F0C0 + lvx 28,31,7 + addi 7,7,0x80 + .long 0x102170C0 + vsel 29,0,7,6 + .long 0x1000E0C0 + .long 0x1021E8C0 + .long 0x13C6FEC2 + .long 0x1021F0C0 + vxor 29,2,3 + vsel 29,3,4,29 + .long 0x10A508C0 + .long 0x13C286C2 + .long 0x13DEE8C0 + .long 0x1021F0C0 + lvx 28,0,7 + .long 0x7E002699 + addi 4,4,16 + vsldoi 15,14,14,8 + .long 0x100078C0 + vsel 29,7,6,5 + .long 0x10E7E0C0 + .long 0x1000E8C0 + .long 0x13C5FEC2 + .long 0x1000F0C0 + vxor 29,1,2 + vsel 29,2,3,29 + .long 0x108400C0 + .long 0x13C186C2 + .long 0x13DEE8C0 + .long 0x1000F0C0 + lvx 28,10,7 + .long 0x10E780C0 + vsel 29,6,5,4 + .long 0x10C6E0C0 + .long 0x10E7E8C0 + .long 0x13C4FEC2 + .long 0x10E7F0C0 + vxor 29,0,1 + vsel 29,1,2,29 + .long 0x106338C0 + .long 0x13C086C2 + .long 0x13DEE8C0 + .long 0x10E7F0C0 + lvx 28,26,7 + .long 0x7E402699 + addi 4,4,16 + vsldoi 17,16,16,8 + .long 0x10C688C0 + vsel 29,5,4,3 + .long 0x10A5E0C0 + .long 0x10C6E8C0 + .long 0x13C3FEC2 + .long 0x10C6F0C0 + vxor 29,7,0 + vsel 29,0,1,29 + .long 0x104230C0 + .long 0x13C786C2 + .long 0x13DEE8C0 + .long 0x10C6F0C0 + lvx 28,27,7 + .long 0x10A590C0 + vsel 29,4,3,2 + .long 0x1084E0C0 + .long 0x10A5E8C0 + .long 0x13C2FEC2 + .long 0x10A5F0C0 + vxor 29,6,7 + vsel 29,7,0,29 + .long 0x102128C0 + .long 0x13C686C2 + .long 0x13DEE8C0 + .long 0x10A5F0C0 + lvx 28,28,7 + .long 0x7F002699 + addi 4,4,16 + vsldoi 19,18,18,8 + .long 0x108498C0 + vsel 29,3,2,1 + .long 0x1063E0C0 + .long 0x1084E8C0 + .long 0x13C1FEC2 + .long 0x1084F0C0 + vxor 29,5,6 + vsel 29,6,7,29 + .long 0x100020C0 + .long 0x13C586C2 + .long 0x13DEE8C0 + .long 0x1084F0C0 + lvx 28,29,7 + .long 0x1063C0C0 + vsel 29,2,1,0 + .long 0x1042E0C0 + .long 0x1063E8C0 + .long 0x13C0FEC2 + .long 0x1063F0C0 + vxor 29,4,5 + vsel 29,5,6,29 + .long 0x10E718C0 + .long 0x13C486C2 + .long 0x13DEE8C0 + .long 0x1063F0C0 + lvx 28,30,7 + .long 0x7F402699 + addi 4,4,16 + vsldoi 25,24,24,8 + .long 0x1042C8C0 + vsel 29,1,0,7 + .long 0x1021E0C0 + .long 0x1042E8C0 + .long 0x13C7FEC2 + .long 0x1042F0C0 + vxor 29,3,4 + vsel 29,4,5,29 + .long 0x10C610C0 + .long 0x13C386C2 + .long 0x13DEE8C0 + .long 0x1042F0C0 + lvx 28,31,7 + addi 7,7,0x80 + .long 0x1021D0C0 + vsel 29,0,7,6 + .long 0x1000E0C0 + .long 0x1021E8C0 + .long 0x13C6FEC2 + .long 0x1021F0C0 + vxor 29,2,3 + vsel 29,3,4,29 + .long 0x10A508C0 + .long 0x13C286C2 + .long 0x13DEE8C0 + .long 0x1021F0C0 + lvx 28,0,7 + vsldoi 27,26,26,8 + .long 0x13C906C2 + .long 0x1108F0C0 + .long 0x13DA7EC2 + .long 0x1108F0C0 + .long 0x110888C0 + .long 0x1000D8C0 + vsel 29,7,6,5 + .long 0x10E7E0C0 + .long 0x1000E8C0 + .long 0x13C5FEC2 + .long 0x1000F0C0 + vxor 29,1,2 + vsel 29,2,3,29 + .long 0x108400C0 + .long 0x13C186C2 + .long 0x13DEE8C0 + .long 0x1000F0C0 + lvx 28,10,7 + mtctr 0 + b .L16_xx +.align 5 +.L16_xx: + .long 0x13CA06C2 + .long 0x1129F0C0 + .long 0x13DB7EC2 + .long 0x1129F0C0 + .long 0x112990C0 + .long 0x10E740C0 + vsel 29,6,5,4 + .long 0x10C6E0C0 + .long 0x10E7E8C0 + .long 0x13C4FEC2 + .long 0x10E7F0C0 + vxor 29,0,1 + vsel 29,1,2,29 + .long 0x106338C0 + .long 0x13C086C2 + .long 0x13DEE8C0 + .long 0x10E7F0C0 + lvx 28,26,7 + .long 0x13CB06C2 + .long 0x114AF0C0 + .long 0x13C87EC2 + .long 0x114AF0C0 + .long 0x114A98C0 + .long 0x10C648C0 + vsel 29,5,4,3 + .long 0x10A5E0C0 + .long 0x10C6E8C0 + .long 0x13C3FEC2 + .long 0x10C6F0C0 + vxor 29,7,0 + vsel 29,0,1,29 + .long 0x104230C0 + .long 
0x13C786C2 + .long 0x13DEE8C0 + .long 0x10C6F0C0 + lvx 28,27,7 + .long 0x13CC06C2 + .long 0x116BF0C0 + .long 0x13C97EC2 + .long 0x116BF0C0 + .long 0x116BC0C0 + .long 0x10A550C0 + vsel 29,4,3,2 + .long 0x1084E0C0 + .long 0x10A5E8C0 + .long 0x13C2FEC2 + .long 0x10A5F0C0 + vxor 29,6,7 + vsel 29,7,0,29 + .long 0x102128C0 + .long 0x13C686C2 + .long 0x13DEE8C0 + .long 0x10A5F0C0 + lvx 28,28,7 + .long 0x13CD06C2 + .long 0x118CF0C0 + .long 0x13CA7EC2 + .long 0x118CF0C0 + .long 0x118CC8C0 + .long 0x108458C0 + vsel 29,3,2,1 + .long 0x1063E0C0 + .long 0x1084E8C0 + .long 0x13C1FEC2 + .long 0x1084F0C0 + vxor 29,5,6 + vsel 29,6,7,29 + .long 0x100020C0 + .long 0x13C586C2 + .long 0x13DEE8C0 + .long 0x1084F0C0 + lvx 28,29,7 + .long 0x13CE06C2 + .long 0x11ADF0C0 + .long 0x13CB7EC2 + .long 0x11ADF0C0 + .long 0x11ADD0C0 + .long 0x106360C0 + vsel 29,2,1,0 + .long 0x1042E0C0 + .long 0x1063E8C0 + .long 0x13C0FEC2 + .long 0x1063F0C0 + vxor 29,4,5 + vsel 29,5,6,29 + .long 0x10E718C0 + .long 0x13C486C2 + .long 0x13DEE8C0 + .long 0x1063F0C0 + lvx 28,30,7 + .long 0x13CF06C2 + .long 0x11CEF0C0 + .long 0x13CC7EC2 + .long 0x11CEF0C0 + .long 0x11CED8C0 + .long 0x104268C0 + vsel 29,1,0,7 + .long 0x1021E0C0 + .long 0x1042E8C0 + .long 0x13C7FEC2 + .long 0x1042F0C0 + vxor 29,3,4 + vsel 29,4,5,29 + .long 0x10C610C0 + .long 0x13C386C2 + .long 0x13DEE8C0 + .long 0x1042F0C0 + lvx 28,31,7 + addi 7,7,0x80 + .long 0x13D006C2 + .long 0x11EFF0C0 + .long 0x13CD7EC2 + .long 0x11EFF0C0 + .long 0x11EF40C0 + .long 0x102170C0 + vsel 29,0,7,6 + .long 0x1000E0C0 + .long 0x1021E8C0 + .long 0x13C6FEC2 + .long 0x1021F0C0 + vxor 29,2,3 + vsel 29,3,4,29 + .long 0x10A508C0 + .long 0x13C286C2 + .long 0x13DEE8C0 + .long 0x1021F0C0 + lvx 28,0,7 + .long 0x13D106C2 + .long 0x1210F0C0 + .long 0x13CE7EC2 + .long 0x1210F0C0 + .long 0x121048C0 + .long 0x100078C0 + vsel 29,7,6,5 + .long 0x10E7E0C0 + .long 0x1000E8C0 + .long 0x13C5FEC2 + .long 0x1000F0C0 + vxor 29,1,2 + vsel 29,2,3,29 + .long 0x108400C0 + .long 0x13C186C2 + .long 0x13DEE8C0 + .long 0x1000F0C0 + lvx 28,10,7 + .long 0x13D206C2 + .long 0x1231F0C0 + .long 0x13CF7EC2 + .long 0x1231F0C0 + .long 0x123150C0 + .long 0x10E780C0 + vsel 29,6,5,4 + .long 0x10C6E0C0 + .long 0x10E7E8C0 + .long 0x13C4FEC2 + .long 0x10E7F0C0 + vxor 29,0,1 + vsel 29,1,2,29 + .long 0x106338C0 + .long 0x13C086C2 + .long 0x13DEE8C0 + .long 0x10E7F0C0 + lvx 28,26,7 + .long 0x13D306C2 + .long 0x1252F0C0 + .long 0x13D07EC2 + .long 0x1252F0C0 + .long 0x125258C0 + .long 0x10C688C0 + vsel 29,5,4,3 + .long 0x10A5E0C0 + .long 0x10C6E8C0 + .long 0x13C3FEC2 + .long 0x10C6F0C0 + vxor 29,7,0 + vsel 29,0,1,29 + .long 0x104230C0 + .long 0x13C786C2 + .long 0x13DEE8C0 + .long 0x10C6F0C0 + lvx 28,27,7 + .long 0x13D806C2 + .long 0x1273F0C0 + .long 0x13D17EC2 + .long 0x1273F0C0 + .long 0x127360C0 + .long 0x10A590C0 + vsel 29,4,3,2 + .long 0x1084E0C0 + .long 0x10A5E8C0 + .long 0x13C2FEC2 + .long 0x10A5F0C0 + vxor 29,6,7 + vsel 29,7,0,29 + .long 0x102128C0 + .long 0x13C686C2 + .long 0x13DEE8C0 + .long 0x10A5F0C0 + lvx 28,28,7 + .long 0x13D906C2 + .long 0x1318F0C0 + .long 0x13D27EC2 + .long 0x1318F0C0 + .long 0x131868C0 + .long 0x108498C0 + vsel 29,3,2,1 + .long 0x1063E0C0 + .long 0x1084E8C0 + .long 0x13C1FEC2 + .long 0x1084F0C0 + vxor 29,5,6 + vsel 29,6,7,29 + .long 0x100020C0 + .long 0x13C586C2 + .long 0x13DEE8C0 + .long 0x1084F0C0 + lvx 28,29,7 + .long 0x13DA06C2 + .long 0x1339F0C0 + .long 0x13D37EC2 + .long 0x1339F0C0 + .long 0x133970C0 + .long 0x1063C0C0 + vsel 29,2,1,0 + .long 0x1042E0C0 + .long 0x1063E8C0 + .long 0x13C0FEC2 + .long 0x1063F0C0 
+ vxor 29,4,5 + vsel 29,5,6,29 + .long 0x10E718C0 + .long 0x13C486C2 + .long 0x13DEE8C0 + .long 0x1063F0C0 + lvx 28,30,7 + .long 0x13DB06C2 + .long 0x135AF0C0 + .long 0x13D87EC2 + .long 0x135AF0C0 + .long 0x135A78C0 + .long 0x1042C8C0 + vsel 29,1,0,7 + .long 0x1021E0C0 + .long 0x1042E8C0 + .long 0x13C7FEC2 + .long 0x1042F0C0 + vxor 29,3,4 + vsel 29,4,5,29 + .long 0x10C610C0 + .long 0x13C386C2 + .long 0x13DEE8C0 + .long 0x1042F0C0 + lvx 28,31,7 + addi 7,7,0x80 + .long 0x13C806C2 + .long 0x137BF0C0 + .long 0x13D97EC2 + .long 0x137BF0C0 + .long 0x137B80C0 + .long 0x1021D0C0 + vsel 29,0,7,6 + .long 0x1000E0C0 + .long 0x1021E8C0 + .long 0x13C6FEC2 + .long 0x1021F0C0 + vxor 29,2,3 + vsel 29,3,4,29 + .long 0x10A508C0 + .long 0x13C286C2 + .long 0x13DEE8C0 + .long 0x1021F0C0 + lvx 28,0,7 + .long 0x13C906C2 + .long 0x1108F0C0 + .long 0x13DA7EC2 + .long 0x1108F0C0 + .long 0x110888C0 + .long 0x1000D8C0 + vsel 29,7,6,5 + .long 0x10E7E0C0 + .long 0x1000E8C0 + .long 0x13C5FEC2 + .long 0x1000F0C0 + vxor 29,1,2 + vsel 29,2,3,29 + .long 0x108400C0 + .long 0x13C186C2 + .long 0x13DEE8C0 + .long 0x1000F0C0 + lvx 28,10,7 + bdnz .L16_xx + + lvx 10,0,11 + subic. 5,5,1 + lvx 11,10,11 + .long 0x100050C0 + lvx 12,26,11 + .long 0x102158C0 + lvx 13,27,11 + .long 0x104260C0 + lvx 14,28,11 + .long 0x106368C0 + lvx 15,29,11 + .long 0x108470C0 + lvx 16,30,11 + .long 0x10A578C0 + lvx 17,31,11 + .long 0x10C680C0 + .long 0x10E788C0 + bne .Loop + vperm 0,0,1,28 + vperm 2,2,3,28 + vperm 4,4,5,28 + vperm 6,6,7,28 + .long 0x7C001F99 + .long 0x7C4A1F99 + .long 0x7C9A1F99 + .long 0x7CDB1F99 + addi 11,1,207 + mtlr 8 + mtspr 256,12 + lvx 24,0,11 + lvx 25,10,11 + lvx 26,26,11 + lvx 27,27,11 + lvx 28,28,11 + lvx 29,29,11 + lvx 30,30,11 + lvx 31,31,11 + ld 26,336(1) + ld 27,344(1) + ld 28,352(1) + ld 29,360(1) + ld 30,368(1) + ld 31,376(1) + addi 1,1,384 + blr +.long 0 +.byte 0,12,4,1,0x80,6,3,0 +.long 0 +.size .zfs_sha512_power8,.-.zfs_sha512_power8 +.size zfs_sha512_power8,.-.zfs_sha512_power8 +.align 6 +.LPICmeup: + mflr 0 + bcl 20,31,$+4 + mflr 6 + addi 6,6,56 + mtlr 0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.space 28 +.long 0x428a2f98,0xd728ae22 +.long 0x428a2f98,0xd728ae22 +.long 0x71374491,0x23ef65cd +.long 0x71374491,0x23ef65cd +.long 0xb5c0fbcf,0xec4d3b2f +.long 0xb5c0fbcf,0xec4d3b2f +.long 0xe9b5dba5,0x8189dbbc +.long 0xe9b5dba5,0x8189dbbc +.long 0x3956c25b,0xf348b538 +.long 0x3956c25b,0xf348b538 +.long 0x59f111f1,0xb605d019 +.long 0x59f111f1,0xb605d019 +.long 0x923f82a4,0xaf194f9b +.long 0x923f82a4,0xaf194f9b +.long 0xab1c5ed5,0xda6d8118 +.long 0xab1c5ed5,0xda6d8118 +.long 0xd807aa98,0xa3030242 +.long 0xd807aa98,0xa3030242 +.long 0x12835b01,0x45706fbe +.long 0x12835b01,0x45706fbe +.long 0x243185be,0x4ee4b28c +.long 0x243185be,0x4ee4b28c +.long 0x550c7dc3,0xd5ffb4e2 +.long 0x550c7dc3,0xd5ffb4e2 +.long 0x72be5d74,0xf27b896f +.long 0x72be5d74,0xf27b896f +.long 0x80deb1fe,0x3b1696b1 +.long 0x80deb1fe,0x3b1696b1 +.long 0x9bdc06a7,0x25c71235 +.long 0x9bdc06a7,0x25c71235 +.long 0xc19bf174,0xcf692694 +.long 0xc19bf174,0xcf692694 +.long 0xe49b69c1,0x9ef14ad2 +.long 0xe49b69c1,0x9ef14ad2 +.long 0xefbe4786,0x384f25e3 +.long 0xefbe4786,0x384f25e3 +.long 0x0fc19dc6,0x8b8cd5b5 +.long 0x0fc19dc6,0x8b8cd5b5 +.long 0x240ca1cc,0x77ac9c65 +.long 0x240ca1cc,0x77ac9c65 +.long 0x2de92c6f,0x592b0275 +.long 0x2de92c6f,0x592b0275 +.long 0x4a7484aa,0x6ea6e483 +.long 0x4a7484aa,0x6ea6e483 +.long 0x5cb0a9dc,0xbd41fbd4 +.long 0x5cb0a9dc,0xbd41fbd4 +.long 0x76f988da,0x831153b5 +.long 0x76f988da,0x831153b5 +.long 0x983e5152,0xee66dfab +.long 
0x983e5152,0xee66dfab +.long 0xa831c66d,0x2db43210 +.long 0xa831c66d,0x2db43210 +.long 0xb00327c8,0x98fb213f +.long 0xb00327c8,0x98fb213f +.long 0xbf597fc7,0xbeef0ee4 +.long 0xbf597fc7,0xbeef0ee4 +.long 0xc6e00bf3,0x3da88fc2 +.long 0xc6e00bf3,0x3da88fc2 +.long 0xd5a79147,0x930aa725 +.long 0xd5a79147,0x930aa725 +.long 0x06ca6351,0xe003826f +.long 0x06ca6351,0xe003826f +.long 0x14292967,0x0a0e6e70 +.long 0x14292967,0x0a0e6e70 +.long 0x27b70a85,0x46d22ffc +.long 0x27b70a85,0x46d22ffc +.long 0x2e1b2138,0x5c26c926 +.long 0x2e1b2138,0x5c26c926 +.long 0x4d2c6dfc,0x5ac42aed +.long 0x4d2c6dfc,0x5ac42aed +.long 0x53380d13,0x9d95b3df +.long 0x53380d13,0x9d95b3df +.long 0x650a7354,0x8baf63de +.long 0x650a7354,0x8baf63de +.long 0x766a0abb,0x3c77b2a8 +.long 0x766a0abb,0x3c77b2a8 +.long 0x81c2c92e,0x47edaee6 +.long 0x81c2c92e,0x47edaee6 +.long 0x92722c85,0x1482353b +.long 0x92722c85,0x1482353b +.long 0xa2bfe8a1,0x4cf10364 +.long 0xa2bfe8a1,0x4cf10364 +.long 0xa81a664b,0xbc423001 +.long 0xa81a664b,0xbc423001 +.long 0xc24b8b70,0xd0f89791 +.long 0xc24b8b70,0xd0f89791 +.long 0xc76c51a3,0x0654be30 +.long 0xc76c51a3,0x0654be30 +.long 0xd192e819,0xd6ef5218 +.long 0xd192e819,0xd6ef5218 +.long 0xd6990624,0x5565a910 +.long 0xd6990624,0x5565a910 +.long 0xf40e3585,0x5771202a +.long 0xf40e3585,0x5771202a +.long 0x106aa070,0x32bbd1b8 +.long 0x106aa070,0x32bbd1b8 +.long 0x19a4c116,0xb8d2d0c8 +.long 0x19a4c116,0xb8d2d0c8 +.long 0x1e376c08,0x5141ab53 +.long 0x1e376c08,0x5141ab53 +.long 0x2748774c,0xdf8eeb99 +.long 0x2748774c,0xdf8eeb99 +.long 0x34b0bcb5,0xe19b48a8 +.long 0x34b0bcb5,0xe19b48a8 +.long 0x391c0cb3,0xc5c95a63 +.long 0x391c0cb3,0xc5c95a63 +.long 0x4ed8aa4a,0xe3418acb +.long 0x4ed8aa4a,0xe3418acb +.long 0x5b9cca4f,0x7763e373 +.long 0x5b9cca4f,0x7763e373 +.long 0x682e6ff3,0xd6b2b8a3 +.long 0x682e6ff3,0xd6b2b8a3 +.long 0x748f82ee,0x5defb2fc +.long 0x748f82ee,0x5defb2fc +.long 0x78a5636f,0x43172f60 +.long 0x78a5636f,0x43172f60 +.long 0x84c87814,0xa1f0ab72 +.long 0x84c87814,0xa1f0ab72 +.long 0x8cc70208,0x1a6439ec +.long 0x8cc70208,0x1a6439ec +.long 0x90befffa,0x23631e28 +.long 0x90befffa,0x23631e28 +.long 0xa4506ceb,0xde82bde9 +.long 0xa4506ceb,0xde82bde9 +.long 0xbef9a3f7,0xb2c67915 +.long 0xbef9a3f7,0xb2c67915 +.long 0xc67178f2,0xe372532b +.long 0xc67178f2,0xe372532b +.long 0xca273ece,0xea26619c +.long 0xca273ece,0xea26619c +.long 0xd186b8c7,0x21c0c207 +.long 0xd186b8c7,0x21c0c207 +.long 0xeada7dd6,0xcde0eb1e +.long 0xeada7dd6,0xcde0eb1e +.long 0xf57d4f7f,0xee6ed178 +.long 0xf57d4f7f,0xee6ed178 +.long 0x06f067aa,0x72176fba +.long 0x06f067aa,0x72176fba +.long 0x0a637dc5,0xa2c898a6 +.long 0x0a637dc5,0xa2c898a6 +.long 0x113f9804,0xbef90dae +.long 0x113f9804,0xbef90dae +.long 0x1b710b35,0x131c471b +.long 0x1b710b35,0x131c471b +.long 0x28db77f5,0x23047d84 +.long 0x28db77f5,0x23047d84 +.long 0x32caab7b,0x40c72493 +.long 0x32caab7b,0x40c72493 +.long 0x3c9ebe0a,0x15c9bebc +.long 0x3c9ebe0a,0x15c9bebc +.long 0x431d67c4,0x9c100d4c +.long 0x431d67c4,0x9c100d4c +.long 0x4cc5d4be,0xcb3e42b6 +.long 0x4cc5d4be,0xcb3e42b6 +.long 0x597f299c,0xfc657e2a +.long 0x597f299c,0xfc657e2a +.long 0x5fcb6fab,0x3ad6faec +.long 0x5fcb6fab,0x3ad6faec +.long 0x6c44198c,0x4a475817 +.long 0x6c44198c,0x4a475817 +.long 0,0 +.long 0,0 +.long 0x00010203,0x04050607 +.long 0x10111213,0x14151617 +.byte 83,72,65,53,49,50,32,102,111,114,32,80,111,119,101,114,73,83,65,32,50,46,48,55,44,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 + +#elif (defined(__PPC64__) && 
defined(__LITTLE_ENDIAN__)) + +.abiversion 2 +.text + +.globl zfs_sha512_power8 +.type zfs_sha512_power8,@function +.align 6 +zfs_sha512_power8: +.localentry zfs_sha512_power8,0 + + stdu 1,-384(1) + mflr 8 + li 10,207 + li 11,223 + stvx 24,10,1 + addi 10,10,32 + li 12,-1 + stvx 25,11,1 + addi 11,11,32 + stvx 26,10,1 + addi 10,10,32 + stvx 27,11,1 + addi 11,11,32 + stvx 28,10,1 + addi 10,10,32 + stvx 29,11,1 + addi 11,11,32 + stvx 30,10,1 + stvx 31,11,1 + li 11,-4096+255 + stw 12,332(1) + li 10,0x10 + std 26,336(1) + li 26,0x20 + std 27,344(1) + li 27,0x30 + std 28,352(1) + li 28,0x40 + std 29,360(1) + li 29,0x50 + std 30,368(1) + li 30,0x60 + std 31,376(1) + li 31,0x70 + std 8,400(1) + or 11,11,11 + + bl .LPICmeup + addi 11,1,79 + li 7,8 + lvsl 31,0,7 + vspltisb 28,0x0f + vxor 31,31,28 + .long 0x7C001E99 + .long 0x7C4A1E99 + .long 0x7C9A1E99 + vsldoi 1,0,0,8 + .long 0x7CDB1E99 + vsldoi 3,2,2,8 + vsldoi 5,4,4,8 + vsldoi 7,6,6,8 + li 0,4 + b .Loop +.align 5 +.Loop: + lvx 28,0,6 + .long 0x7D002699 + addi 4,4,16 + mr 7,6 + stvx 0,0,11 + stvx 1,10,11 + stvx 2,26,11 + stvx 3,27,11 + stvx 4,28,11 + stvx 5,29,11 + stvx 6,30,11 + stvx 7,31,11 + .long 0x10E7E0C0 + lvx 28,10,6 + vperm 8,8,8,31 + .long 0x10E740C0 + vsel 29,6,5,4 + .long 0x10C6E0C0 + .long 0x10E7E8C0 + .long 0x13C4FEC2 + .long 0x10E7F0C0 + vxor 29,0,1 + vsel 29,1,2,29 + .long 0x106338C0 + .long 0x13C086C2 + .long 0x13DEE8C0 + .long 0x10E7F0C0 + lvx 28,26,7 + .long 0x7D402699 + addi 4,4,16 + vsldoi 9,8,8,8 + .long 0x10C648C0 + vsel 29,5,4,3 + .long 0x10A5E0C0 + .long 0x10C6E8C0 + .long 0x13C3FEC2 + .long 0x10C6F0C0 + vxor 29,7,0 + vsel 29,0,1,29 + .long 0x104230C0 + .long 0x13C786C2 + .long 0x13DEE8C0 + .long 0x10C6F0C0 + lvx 28,27,7 + vperm 10,10,10,31 + .long 0x10A550C0 + vsel 29,4,3,2 + .long 0x1084E0C0 + .long 0x10A5E8C0 + .long 0x13C2FEC2 + .long 0x10A5F0C0 + vxor 29,6,7 + vsel 29,7,0,29 + .long 0x102128C0 + .long 0x13C686C2 + .long 0x13DEE8C0 + .long 0x10A5F0C0 + lvx 28,28,7 + .long 0x7D802699 + addi 4,4,16 + vsldoi 11,10,10,8 + .long 0x108458C0 + vsel 29,3,2,1 + .long 0x1063E0C0 + .long 0x1084E8C0 + .long 0x13C1FEC2 + .long 0x1084F0C0 + vxor 29,5,6 + vsel 29,6,7,29 + .long 0x100020C0 + .long 0x13C586C2 + .long 0x13DEE8C0 + .long 0x1084F0C0 + lvx 28,29,7 + vperm 12,12,12,31 + .long 0x106360C0 + vsel 29,2,1,0 + .long 0x1042E0C0 + .long 0x1063E8C0 + .long 0x13C0FEC2 + .long 0x1063F0C0 + vxor 29,4,5 + vsel 29,5,6,29 + .long 0x10E718C0 + .long 0x13C486C2 + .long 0x13DEE8C0 + .long 0x1063F0C0 + lvx 28,30,7 + .long 0x7DC02699 + addi 4,4,16 + vsldoi 13,12,12,8 + .long 0x104268C0 + vsel 29,1,0,7 + .long 0x1021E0C0 + .long 0x1042E8C0 + .long 0x13C7FEC2 + .long 0x1042F0C0 + vxor 29,3,4 + vsel 29,4,5,29 + .long 0x10C610C0 + .long 0x13C386C2 + .long 0x13DEE8C0 + .long 0x1042F0C0 + lvx 28,31,7 + addi 7,7,0x80 + vperm 14,14,14,31 + .long 0x102170C0 + vsel 29,0,7,6 + .long 0x1000E0C0 + .long 0x1021E8C0 + .long 0x13C6FEC2 + .long 0x1021F0C0 + vxor 29,2,3 + vsel 29,3,4,29 + .long 0x10A508C0 + .long 0x13C286C2 + .long 0x13DEE8C0 + .long 0x1021F0C0 + lvx 28,0,7 + .long 0x7E002699 + addi 4,4,16 + vsldoi 15,14,14,8 + .long 0x100078C0 + vsel 29,7,6,5 + .long 0x10E7E0C0 + .long 0x1000E8C0 + .long 0x13C5FEC2 + .long 0x1000F0C0 + vxor 29,1,2 + vsel 29,2,3,29 + .long 0x108400C0 + .long 0x13C186C2 + .long 0x13DEE8C0 + .long 0x1000F0C0 + lvx 28,10,7 + vperm 16,16,16,31 + .long 0x10E780C0 + vsel 29,6,5,4 + .long 0x10C6E0C0 + .long 0x10E7E8C0 + .long 0x13C4FEC2 + .long 0x10E7F0C0 + vxor 29,0,1 + vsel 29,1,2,29 + .long 0x106338C0 + .long 0x13C086C2 + .long 
0x13DEE8C0 + .long 0x10E7F0C0 + lvx 28,26,7 + .long 0x7E402699 + addi 4,4,16 + vsldoi 17,16,16,8 + .long 0x10C688C0 + vsel 29,5,4,3 + .long 0x10A5E0C0 + .long 0x10C6E8C0 + .long 0x13C3FEC2 + .long 0x10C6F0C0 + vxor 29,7,0 + vsel 29,0,1,29 + .long 0x104230C0 + .long 0x13C786C2 + .long 0x13DEE8C0 + .long 0x10C6F0C0 + lvx 28,27,7 + vperm 18,18,18,31 + .long 0x10A590C0 + vsel 29,4,3,2 + .long 0x1084E0C0 + .long 0x10A5E8C0 + .long 0x13C2FEC2 + .long 0x10A5F0C0 + vxor 29,6,7 + vsel 29,7,0,29 + .long 0x102128C0 + .long 0x13C686C2 + .long 0x13DEE8C0 + .long 0x10A5F0C0 + lvx 28,28,7 + .long 0x7F002699 + addi 4,4,16 + vsldoi 19,18,18,8 + .long 0x108498C0 + vsel 29,3,2,1 + .long 0x1063E0C0 + .long 0x1084E8C0 + .long 0x13C1FEC2 + .long 0x1084F0C0 + vxor 29,5,6 + vsel 29,6,7,29 + .long 0x100020C0 + .long 0x13C586C2 + .long 0x13DEE8C0 + .long 0x1084F0C0 + lvx 28,29,7 + vperm 24,24,24,31 + .long 0x1063C0C0 + vsel 29,2,1,0 + .long 0x1042E0C0 + .long 0x1063E8C0 + .long 0x13C0FEC2 + .long 0x1063F0C0 + vxor 29,4,5 + vsel 29,5,6,29 + .long 0x10E718C0 + .long 0x13C486C2 + .long 0x13DEE8C0 + .long 0x1063F0C0 + lvx 28,30,7 + .long 0x7F402699 + addi 4,4,16 + vsldoi 25,24,24,8 + .long 0x1042C8C0 + vsel 29,1,0,7 + .long 0x1021E0C0 + .long 0x1042E8C0 + .long 0x13C7FEC2 + .long 0x1042F0C0 + vxor 29,3,4 + vsel 29,4,5,29 + .long 0x10C610C0 + .long 0x13C386C2 + .long 0x13DEE8C0 + .long 0x1042F0C0 + lvx 28,31,7 + addi 7,7,0x80 + vperm 26,26,26,31 + .long 0x1021D0C0 + vsel 29,0,7,6 + .long 0x1000E0C0 + .long 0x1021E8C0 + .long 0x13C6FEC2 + .long 0x1021F0C0 + vxor 29,2,3 + vsel 29,3,4,29 + .long 0x10A508C0 + .long 0x13C286C2 + .long 0x13DEE8C0 + .long 0x1021F0C0 + lvx 28,0,7 + vsldoi 27,26,26,8 + .long 0x13C906C2 + .long 0x1108F0C0 + .long 0x13DA7EC2 + .long 0x1108F0C0 + .long 0x110888C0 + .long 0x1000D8C0 + vsel 29,7,6,5 + .long 0x10E7E0C0 + .long 0x1000E8C0 + .long 0x13C5FEC2 + .long 0x1000F0C0 + vxor 29,1,2 + vsel 29,2,3,29 + .long 0x108400C0 + .long 0x13C186C2 + .long 0x13DEE8C0 + .long 0x1000F0C0 + lvx 28,10,7 + mtctr 0 + b .L16_xx +.align 5 +.L16_xx: + .long 0x13CA06C2 + .long 0x1129F0C0 + .long 0x13DB7EC2 + .long 0x1129F0C0 + .long 0x112990C0 + .long 0x10E740C0 + vsel 29,6,5,4 + .long 0x10C6E0C0 + .long 0x10E7E8C0 + .long 0x13C4FEC2 + .long 0x10E7F0C0 + vxor 29,0,1 + vsel 29,1,2,29 + .long 0x106338C0 + .long 0x13C086C2 + .long 0x13DEE8C0 + .long 0x10E7F0C0 + lvx 28,26,7 + .long 0x13CB06C2 + .long 0x114AF0C0 + .long 0x13C87EC2 + .long 0x114AF0C0 + .long 0x114A98C0 + .long 0x10C648C0 + vsel 29,5,4,3 + .long 0x10A5E0C0 + .long 0x10C6E8C0 + .long 0x13C3FEC2 + .long 0x10C6F0C0 + vxor 29,7,0 + vsel 29,0,1,29 + .long 0x104230C0 + .long 0x13C786C2 + .long 0x13DEE8C0 + .long 0x10C6F0C0 + lvx 28,27,7 + .long 0x13CC06C2 + .long 0x116BF0C0 + .long 0x13C97EC2 + .long 0x116BF0C0 + .long 0x116BC0C0 + .long 0x10A550C0 + vsel 29,4,3,2 + .long 0x1084E0C0 + .long 0x10A5E8C0 + .long 0x13C2FEC2 + .long 0x10A5F0C0 + vxor 29,6,7 + vsel 29,7,0,29 + .long 0x102128C0 + .long 0x13C686C2 + .long 0x13DEE8C0 + .long 0x10A5F0C0 + lvx 28,28,7 + .long 0x13CD06C2 + .long 0x118CF0C0 + .long 0x13CA7EC2 + .long 0x118CF0C0 + .long 0x118CC8C0 + .long 0x108458C0 + vsel 29,3,2,1 + .long 0x1063E0C0 + .long 0x1084E8C0 + .long 0x13C1FEC2 + .long 0x1084F0C0 + vxor 29,5,6 + vsel 29,6,7,29 + .long 0x100020C0 + .long 0x13C586C2 + .long 0x13DEE8C0 + .long 0x1084F0C0 + lvx 28,29,7 + .long 0x13CE06C2 + .long 0x11ADF0C0 + .long 0x13CB7EC2 + .long 0x11ADF0C0 + .long 0x11ADD0C0 + .long 0x106360C0 + vsel 29,2,1,0 + .long 0x1042E0C0 + .long 0x1063E8C0 + .long 0x13C0FEC2 
+ .long 0x1063F0C0 + vxor 29,4,5 + vsel 29,5,6,29 + .long 0x10E718C0 + .long 0x13C486C2 + .long 0x13DEE8C0 + .long 0x1063F0C0 + lvx 28,30,7 + .long 0x13CF06C2 + .long 0x11CEF0C0 + .long 0x13CC7EC2 + .long 0x11CEF0C0 + .long 0x11CED8C0 + .long 0x104268C0 + vsel 29,1,0,7 + .long 0x1021E0C0 + .long 0x1042E8C0 + .long 0x13C7FEC2 + .long 0x1042F0C0 + vxor 29,3,4 + vsel 29,4,5,29 + .long 0x10C610C0 + .long 0x13C386C2 + .long 0x13DEE8C0 + .long 0x1042F0C0 + lvx 28,31,7 + addi 7,7,0x80 + .long 0x13D006C2 + .long 0x11EFF0C0 + .long 0x13CD7EC2 + .long 0x11EFF0C0 + .long 0x11EF40C0 + .long 0x102170C0 + vsel 29,0,7,6 + .long 0x1000E0C0 + .long 0x1021E8C0 + .long 0x13C6FEC2 + .long 0x1021F0C0 + vxor 29,2,3 + vsel 29,3,4,29 + .long 0x10A508C0 + .long 0x13C286C2 + .long 0x13DEE8C0 + .long 0x1021F0C0 + lvx 28,0,7 + .long 0x13D106C2 + .long 0x1210F0C0 + .long 0x13CE7EC2 + .long 0x1210F0C0 + .long 0x121048C0 + .long 0x100078C0 + vsel 29,7,6,5 + .long 0x10E7E0C0 + .long 0x1000E8C0 + .long 0x13C5FEC2 + .long 0x1000F0C0 + vxor 29,1,2 + vsel 29,2,3,29 + .long 0x108400C0 + .long 0x13C186C2 + .long 0x13DEE8C0 + .long 0x1000F0C0 + lvx 28,10,7 + .long 0x13D206C2 + .long 0x1231F0C0 + .long 0x13CF7EC2 + .long 0x1231F0C0 + .long 0x123150C0 + .long 0x10E780C0 + vsel 29,6,5,4 + .long 0x10C6E0C0 + .long 0x10E7E8C0 + .long 0x13C4FEC2 + .long 0x10E7F0C0 + vxor 29,0,1 + vsel 29,1,2,29 + .long 0x106338C0 + .long 0x13C086C2 + .long 0x13DEE8C0 + .long 0x10E7F0C0 + lvx 28,26,7 + .long 0x13D306C2 + .long 0x1252F0C0 + .long 0x13D07EC2 + .long 0x1252F0C0 + .long 0x125258C0 + .long 0x10C688C0 + vsel 29,5,4,3 + .long 0x10A5E0C0 + .long 0x10C6E8C0 + .long 0x13C3FEC2 + .long 0x10C6F0C0 + vxor 29,7,0 + vsel 29,0,1,29 + .long 0x104230C0 + .long 0x13C786C2 + .long 0x13DEE8C0 + .long 0x10C6F0C0 + lvx 28,27,7 + .long 0x13D806C2 + .long 0x1273F0C0 + .long 0x13D17EC2 + .long 0x1273F0C0 + .long 0x127360C0 + .long 0x10A590C0 + vsel 29,4,3,2 + .long 0x1084E0C0 + .long 0x10A5E8C0 + .long 0x13C2FEC2 + .long 0x10A5F0C0 + vxor 29,6,7 + vsel 29,7,0,29 + .long 0x102128C0 + .long 0x13C686C2 + .long 0x13DEE8C0 + .long 0x10A5F0C0 + lvx 28,28,7 + .long 0x13D906C2 + .long 0x1318F0C0 + .long 0x13D27EC2 + .long 0x1318F0C0 + .long 0x131868C0 + .long 0x108498C0 + vsel 29,3,2,1 + .long 0x1063E0C0 + .long 0x1084E8C0 + .long 0x13C1FEC2 + .long 0x1084F0C0 + vxor 29,5,6 + vsel 29,6,7,29 + .long 0x100020C0 + .long 0x13C586C2 + .long 0x13DEE8C0 + .long 0x1084F0C0 + lvx 28,29,7 + .long 0x13DA06C2 + .long 0x1339F0C0 + .long 0x13D37EC2 + .long 0x1339F0C0 + .long 0x133970C0 + .long 0x1063C0C0 + vsel 29,2,1,0 + .long 0x1042E0C0 + .long 0x1063E8C0 + .long 0x13C0FEC2 + .long 0x1063F0C0 + vxor 29,4,5 + vsel 29,5,6,29 + .long 0x10E718C0 + .long 0x13C486C2 + .long 0x13DEE8C0 + .long 0x1063F0C0 + lvx 28,30,7 + .long 0x13DB06C2 + .long 0x135AF0C0 + .long 0x13D87EC2 + .long 0x135AF0C0 + .long 0x135A78C0 + .long 0x1042C8C0 + vsel 29,1,0,7 + .long 0x1021E0C0 + .long 0x1042E8C0 + .long 0x13C7FEC2 + .long 0x1042F0C0 + vxor 29,3,4 + vsel 29,4,5,29 + .long 0x10C610C0 + .long 0x13C386C2 + .long 0x13DEE8C0 + .long 0x1042F0C0 + lvx 28,31,7 + addi 7,7,0x80 + .long 0x13C806C2 + .long 0x137BF0C0 + .long 0x13D97EC2 + .long 0x137BF0C0 + .long 0x137B80C0 + .long 0x1021D0C0 + vsel 29,0,7,6 + .long 0x1000E0C0 + .long 0x1021E8C0 + .long 0x13C6FEC2 + .long 0x1021F0C0 + vxor 29,2,3 + vsel 29,3,4,29 + .long 0x10A508C0 + .long 0x13C286C2 + .long 0x13DEE8C0 + .long 0x1021F0C0 + lvx 28,0,7 + .long 0x13C906C2 + .long 0x1108F0C0 + .long 0x13DA7EC2 + .long 0x1108F0C0 + .long 0x110888C0 + .long 0x1000D8C0 
+ vsel 29,7,6,5 + .long 0x10E7E0C0 + .long 0x1000E8C0 + .long 0x13C5FEC2 + .long 0x1000F0C0 + vxor 29,1,2 + vsel 29,2,3,29 + .long 0x108400C0 + .long 0x13C186C2 + .long 0x13DEE8C0 + .long 0x1000F0C0 + lvx 28,10,7 + bdnz .L16_xx + + lvx 10,0,11 + subic. 5,5,1 + lvx 11,10,11 + .long 0x100050C0 + lvx 12,26,11 + .long 0x102158C0 + lvx 13,27,11 + .long 0x104260C0 + lvx 14,28,11 + .long 0x106368C0 + lvx 15,29,11 + .long 0x108470C0 + lvx 16,30,11 + .long 0x10A578C0 + lvx 17,31,11 + .long 0x10C680C0 + .long 0x10E788C0 + bne .Loop + vperm 0,0,1,28 + vperm 2,2,3,28 + vperm 4,4,5,28 + vperm 6,6,7,28 + .long 0x7C001F99 + .long 0x7C4A1F99 + .long 0x7C9A1F99 + .long 0x7CDB1F99 + addi 11,1,207 + mtlr 8 + or 12,12,12 + lvx 24,0,11 + lvx 25,10,11 + lvx 26,26,11 + lvx 27,27,11 + lvx 28,28,11 + lvx 29,29,11 + lvx 30,30,11 + lvx 31,31,11 + ld 26,336(1) + ld 27,344(1) + ld 28,352(1) + ld 29,360(1) + ld 30,368(1) + ld 31,376(1) + addi 1,1,384 + blr +.long 0 +.byte 0,12,4,1,0x80,6,3,0 +.long 0 +.size zfs_sha512_power8,.-zfs_sha512_power8 +.align 6 +.LPICmeup: + mflr 0 + bcl 20,31,$+4 + mflr 6 + addi 6,6,56 + mtlr 0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.space 28 +.long 0xd728ae22,0x428a2f98 +.long 0xd728ae22,0x428a2f98 +.long 0x23ef65cd,0x71374491 +.long 0x23ef65cd,0x71374491 +.long 0xec4d3b2f,0xb5c0fbcf +.long 0xec4d3b2f,0xb5c0fbcf +.long 0x8189dbbc,0xe9b5dba5 +.long 0x8189dbbc,0xe9b5dba5 +.long 0xf348b538,0x3956c25b +.long 0xf348b538,0x3956c25b +.long 0xb605d019,0x59f111f1 +.long 0xb605d019,0x59f111f1 +.long 0xaf194f9b,0x923f82a4 +.long 0xaf194f9b,0x923f82a4 +.long 0xda6d8118,0xab1c5ed5 +.long 0xda6d8118,0xab1c5ed5 +.long 0xa3030242,0xd807aa98 +.long 0xa3030242,0xd807aa98 +.long 0x45706fbe,0x12835b01 +.long 0x45706fbe,0x12835b01 +.long 0x4ee4b28c,0x243185be +.long 0x4ee4b28c,0x243185be +.long 0xd5ffb4e2,0x550c7dc3 +.long 0xd5ffb4e2,0x550c7dc3 +.long 0xf27b896f,0x72be5d74 +.long 0xf27b896f,0x72be5d74 +.long 0x3b1696b1,0x80deb1fe +.long 0x3b1696b1,0x80deb1fe +.long 0x25c71235,0x9bdc06a7 +.long 0x25c71235,0x9bdc06a7 +.long 0xcf692694,0xc19bf174 +.long 0xcf692694,0xc19bf174 +.long 0x9ef14ad2,0xe49b69c1 +.long 0x9ef14ad2,0xe49b69c1 +.long 0x384f25e3,0xefbe4786 +.long 0x384f25e3,0xefbe4786 +.long 0x8b8cd5b5,0x0fc19dc6 +.long 0x8b8cd5b5,0x0fc19dc6 +.long 0x77ac9c65,0x240ca1cc +.long 0x77ac9c65,0x240ca1cc +.long 0x592b0275,0x2de92c6f +.long 0x592b0275,0x2de92c6f +.long 0x6ea6e483,0x4a7484aa +.long 0x6ea6e483,0x4a7484aa +.long 0xbd41fbd4,0x5cb0a9dc +.long 0xbd41fbd4,0x5cb0a9dc +.long 0x831153b5,0x76f988da +.long 0x831153b5,0x76f988da +.long 0xee66dfab,0x983e5152 +.long 0xee66dfab,0x983e5152 +.long 0x2db43210,0xa831c66d +.long 0x2db43210,0xa831c66d +.long 0x98fb213f,0xb00327c8 +.long 0x98fb213f,0xb00327c8 +.long 0xbeef0ee4,0xbf597fc7 +.long 0xbeef0ee4,0xbf597fc7 +.long 0x3da88fc2,0xc6e00bf3 +.long 0x3da88fc2,0xc6e00bf3 +.long 0x930aa725,0xd5a79147 +.long 0x930aa725,0xd5a79147 +.long 0xe003826f,0x06ca6351 +.long 0xe003826f,0x06ca6351 +.long 0x0a0e6e70,0x14292967 +.long 0x0a0e6e70,0x14292967 +.long 0x46d22ffc,0x27b70a85 +.long 0x46d22ffc,0x27b70a85 +.long 0x5c26c926,0x2e1b2138 +.long 0x5c26c926,0x2e1b2138 +.long 0x5ac42aed,0x4d2c6dfc +.long 0x5ac42aed,0x4d2c6dfc +.long 0x9d95b3df,0x53380d13 +.long 0x9d95b3df,0x53380d13 +.long 0x8baf63de,0x650a7354 +.long 0x8baf63de,0x650a7354 +.long 0x3c77b2a8,0x766a0abb +.long 0x3c77b2a8,0x766a0abb +.long 0x47edaee6,0x81c2c92e +.long 0x47edaee6,0x81c2c92e +.long 0x1482353b,0x92722c85 +.long 0x1482353b,0x92722c85 +.long 0x4cf10364,0xa2bfe8a1 +.long 0x4cf10364,0xa2bfe8a1 +.long 
0xbc423001,0xa81a664b +.long 0xbc423001,0xa81a664b +.long 0xd0f89791,0xc24b8b70 +.long 0xd0f89791,0xc24b8b70 +.long 0x0654be30,0xc76c51a3 +.long 0x0654be30,0xc76c51a3 +.long 0xd6ef5218,0xd192e819 +.long 0xd6ef5218,0xd192e819 +.long 0x5565a910,0xd6990624 +.long 0x5565a910,0xd6990624 +.long 0x5771202a,0xf40e3585 +.long 0x5771202a,0xf40e3585 +.long 0x32bbd1b8,0x106aa070 +.long 0x32bbd1b8,0x106aa070 +.long 0xb8d2d0c8,0x19a4c116 +.long 0xb8d2d0c8,0x19a4c116 +.long 0x5141ab53,0x1e376c08 +.long 0x5141ab53,0x1e376c08 +.long 0xdf8eeb99,0x2748774c +.long 0xdf8eeb99,0x2748774c +.long 0xe19b48a8,0x34b0bcb5 +.long 0xe19b48a8,0x34b0bcb5 +.long 0xc5c95a63,0x391c0cb3 +.long 0xc5c95a63,0x391c0cb3 +.long 0xe3418acb,0x4ed8aa4a +.long 0xe3418acb,0x4ed8aa4a +.long 0x7763e373,0x5b9cca4f +.long 0x7763e373,0x5b9cca4f +.long 0xd6b2b8a3,0x682e6ff3 +.long 0xd6b2b8a3,0x682e6ff3 +.long 0x5defb2fc,0x748f82ee +.long 0x5defb2fc,0x748f82ee +.long 0x43172f60,0x78a5636f +.long 0x43172f60,0x78a5636f +.long 0xa1f0ab72,0x84c87814 +.long 0xa1f0ab72,0x84c87814 +.long 0x1a6439ec,0x8cc70208 +.long 0x1a6439ec,0x8cc70208 +.long 0x23631e28,0x90befffa +.long 0x23631e28,0x90befffa +.long 0xde82bde9,0xa4506ceb +.long 0xde82bde9,0xa4506ceb +.long 0xb2c67915,0xbef9a3f7 +.long 0xb2c67915,0xbef9a3f7 +.long 0xe372532b,0xc67178f2 +.long 0xe372532b,0xc67178f2 +.long 0xea26619c,0xca273ece +.long 0xea26619c,0xca273ece +.long 0x21c0c207,0xd186b8c7 +.long 0x21c0c207,0xd186b8c7 +.long 0xcde0eb1e,0xeada7dd6 +.long 0xcde0eb1e,0xeada7dd6 +.long 0xee6ed178,0xf57d4f7f +.long 0xee6ed178,0xf57d4f7f +.long 0x72176fba,0x06f067aa +.long 0x72176fba,0x06f067aa +.long 0xa2c898a6,0x0a637dc5 +.long 0xa2c898a6,0x0a637dc5 +.long 0xbef90dae,0x113f9804 +.long 0xbef90dae,0x113f9804 +.long 0x131c471b,0x1b710b35 +.long 0x131c471b,0x1b710b35 +.long 0x23047d84,0x28db77f5 +.long 0x23047d84,0x28db77f5 +.long 0x40c72493,0x32caab7b +.long 0x40c72493,0x32caab7b +.long 0x15c9bebc,0x3c9ebe0a +.long 0x15c9bebc,0x3c9ebe0a +.long 0x9c100d4c,0x431d67c4 +.long 0x9c100d4c,0x431d67c4 +.long 0xcb3e42b6,0x4cc5d4be +.long 0xcb3e42b6,0x4cc5d4be +.long 0xfc657e2a,0x597f299c +.long 0xfc657e2a,0x597f299c +.long 0x3ad6faec,0x5fcb6fab +.long 0x3ad6faec,0x5fcb6fab +.long 0x4a475817,0x6c44198c +.long 0x4a475817,0x6c44198c +.long 0,0 +.long 0,0 +.long 0x14151617,0x10111213 +.long 0x04050607,0x00010203 +.byte 83,72,65,53,49,50,32,102,111,114,32,80,111,119,101,114,73,83,65,32,50,46,48,55,44,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 + +#endif diff --git a/module/icp/asm-ppc64/sha2/sha512-ppc.S b/module/icp/asm-ppc64/sha2/sha512-ppc.S new file mode 100644 index 000000000000..6ff4e5702ed3 --- /dev/null +++ b/module/icp/asm-ppc64/sha2/sha512-ppc.S @@ -0,0 +1,2945 @@ +/* + * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. 
You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#if (defined(__PPC64__) && defined(__BIG_ENDIAN__)) + +.text + +.globl zfs_sha512_ppc +.type zfs_sha512_ppc,@function +.section ".opd","aw" +.align 3 +zfs_sha512_ppc: +.quad .zfs_sha512_ppc,.TOC.@tocbase,0 +.previous +.align 6 +.zfs_sha512_ppc: + stdu 1,-384(1) + mflr 0 + sldi 5,5,7 + + std 3,208(1) + + std 14,240(1) + std 15,248(1) + std 16,256(1) + std 17,264(1) + std 18,272(1) + std 19,280(1) + std 20,288(1) + std 21,296(1) + std 22,304(1) + std 23,312(1) + std 24,320(1) + std 25,328(1) + std 26,336(1) + std 27,344(1) + std 28,352(1) + std 29,360(1) + std 30,368(1) + std 31,376(1) + std 0,400(1) + ld 8,0(3) + mr 31,4 + ld 9,8(3) + ld 10,16(3) + ld 11,24(3) + ld 12,32(3) + ld 6,40(3) + ld 14,48(3) + ld 15,56(3) + bl .LPICmeup +.LPICedup: + andi. 0,31,3 + bne .Lunaligned +.Laligned: + add 5,31,5 + std 5,192(1) + std 31,200(1) + bl .Lsha2_block_private + b .Ldone + + + + + + + +.align 4 +.Lunaligned: + subfic 0,31,4096 + andi. 0,0,3968 + beq .Lcross_page + cmpld 5,0 + ble .Laligned + subfc 5,0,5 + add 0,31,0 + std 5,184(1) + std 0,192(1) + std 31,200(1) + bl .Lsha2_block_private + + ld 5,184(1) +.Lcross_page: + li 0,32 + mtctr 0 + addi 20,1,48 +.Lmemcpy: + lbz 16,0(31) + lbz 17,1(31) + lbz 18,2(31) + lbz 19,3(31) + addi 31,31,4 + stb 16,0(20) + stb 17,1(20) + stb 18,2(20) + stb 19,3(20) + addi 20,20,4 + bdnz .Lmemcpy + std 31,176(1) + addi 0,1,176 + addi 31,1,48 + std 5,184(1) + std 0,192(1) + std 31,200(1) + bl .Lsha2_block_private + ld 31,176(1) + ld 5,184(1) + addic. 5,5,-128 + bne .Lunaligned + +.Ldone: + ld 0,400(1) + ld 14,240(1) + ld 15,248(1) + ld 16,256(1) + ld 17,264(1) + ld 18,272(1) + ld 19,280(1) + ld 20,288(1) + ld 21,296(1) + ld 22,304(1) + ld 23,312(1) + ld 24,320(1) + ld 25,328(1) + ld 26,336(1) + ld 27,344(1) + ld 28,352(1) + ld 29,360(1) + ld 30,368(1) + ld 31,376(1) + mtlr 0 + addi 1,1,384 + blr +.long 0 +.byte 0,12,4,1,0x80,18,3,0 +.long 0 +.align 4 +.Lsha2_block_private: + ld 0,0(7) + lwz 5,0(31) + lwz 16,4(31) + insrdi 16,5,32,0 + rotrdi 3,12,14 + rotrdi 4,12,18 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrdi 4,4,23 + or 5,5,0 + add 15,15,16 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrdi 3,8,28 + rotrdi 4,8,34 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + ld 0,8(7) + add 15,15,3 + add 15,15,5 + + lwz 5,8(31) + lwz 17,12(31) + insrdi 17,5,32,0 + rotrdi 3,11,14 + rotrdi 4,11,18 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrdi 4,4,23 + or 5,5,0 + add 14,14,17 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrdi 3,15,28 + rotrdi 4,15,34 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + ld 0,16(7) + add 14,14,3 + add 14,14,5 + + lwz 5,16(31) + lwz 18,20(31) + insrdi 18,5,32,0 + rotrdi 3,10,14 + rotrdi 4,10,18 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrdi 4,4,23 + or 5,5,0 + add 6,6,18 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrdi 3,14,28 + rotrdi 4,14,34 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + ld 0,24(7) + add 6,6,3 + add 6,6,5 + + lwz 5,24(31) + lwz 19,28(31) + insrdi 19,5,32,0 + rotrdi 3,9,14 + rotrdi 4,9,18 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrdi 4,4,23 + or 5,5,0 + add 12,12,19 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrdi 3,6,28 + rotrdi 4,6,34 + 
and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + ld 0,32(7) + add 12,12,3 + add 12,12,5 + + lwz 5,32(31) + lwz 20,36(31) + insrdi 20,5,32,0 + rotrdi 3,8,14 + rotrdi 4,8,18 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrdi 4,4,23 + or 5,5,0 + add 11,11,20 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrdi 3,12,28 + rotrdi 4,12,34 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + ld 0,40(7) + add 11,11,3 + add 11,11,5 + + lwz 5,40(31) + lwz 21,44(31) + insrdi 21,5,32,0 + rotrdi 3,15,14 + rotrdi 4,15,18 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrdi 4,4,23 + or 5,5,0 + add 10,10,21 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrdi 3,11,28 + rotrdi 4,11,34 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + ld 0,48(7) + add 10,10,3 + add 10,10,5 + + lwz 5,48(31) + lwz 22,52(31) + insrdi 22,5,32,0 + rotrdi 3,14,14 + rotrdi 4,14,18 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrdi 4,4,23 + or 5,5,0 + add 9,9,22 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrdi 3,10,28 + rotrdi 4,10,34 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + ld 0,56(7) + add 9,9,3 + add 9,9,5 + + lwz 5,56(31) + lwz 23,60(31) + insrdi 23,5,32,0 + rotrdi 3,6,14 + rotrdi 4,6,18 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrdi 4,4,23 + or 5,5,0 + add 8,8,23 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrdi 3,9,28 + rotrdi 4,9,34 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + ld 0,64(7) + add 8,8,3 + add 8,8,5 + + lwz 5,64(31) + lwz 24,68(31) + insrdi 24,5,32,0 + rotrdi 3,12,14 + rotrdi 4,12,18 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrdi 4,4,23 + or 5,5,0 + add 15,15,24 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrdi 3,8,28 + rotrdi 4,8,34 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + ld 0,72(7) + add 15,15,3 + add 15,15,5 + + lwz 5,72(31) + lwz 25,76(31) + insrdi 25,5,32,0 + rotrdi 3,11,14 + rotrdi 4,11,18 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrdi 4,4,23 + or 5,5,0 + add 14,14,25 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrdi 3,15,28 + rotrdi 4,15,34 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + ld 0,80(7) + add 14,14,3 + add 14,14,5 + + lwz 5,80(31) + lwz 26,84(31) + insrdi 26,5,32,0 + rotrdi 3,10,14 + rotrdi 4,10,18 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrdi 4,4,23 + or 5,5,0 + add 6,6,26 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrdi 3,14,28 + rotrdi 4,14,34 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + ld 0,88(7) + add 6,6,3 + add 6,6,5 + + lwz 5,88(31) + lwz 27,92(31) + insrdi 27,5,32,0 + rotrdi 3,9,14 + rotrdi 4,9,18 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrdi 4,4,23 + or 5,5,0 + add 12,12,27 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrdi 3,6,28 + rotrdi 4,6,34 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + ld 0,96(7) + add 12,12,3 + add 12,12,5 + + lwz 5,96(31) + lwz 28,100(31) + insrdi 28,5,32,0 + rotrdi 3,8,14 + rotrdi 4,8,18 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + 
andc 0,10,8 + rotrdi 4,4,23 + or 5,5,0 + add 11,11,28 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrdi 3,12,28 + rotrdi 4,12,34 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + ld 0,104(7) + add 11,11,3 + add 11,11,5 + + lwz 5,104(31) + lwz 29,108(31) + insrdi 29,5,32,0 + rotrdi 3,15,14 + rotrdi 4,15,18 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrdi 4,4,23 + or 5,5,0 + add 10,10,29 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrdi 3,11,28 + rotrdi 4,11,34 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + ld 0,112(7) + add 10,10,3 + add 10,10,5 + + lwz 5,112(31) + lwz 30,116(31) + insrdi 30,5,32,0 + rotrdi 3,14,14 + rotrdi 4,14,18 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrdi 4,4,23 + or 5,5,0 + add 9,9,30 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrdi 3,10,28 + rotrdi 4,10,34 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + ld 0,120(7) + add 9,9,3 + add 9,9,5 + + lwz 5,120(31) + lwz 31,124(31) + insrdi 31,5,32,0 + rotrdi 3,6,14 + rotrdi 4,6,18 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrdi 4,4,23 + or 5,5,0 + add 8,8,31 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrdi 3,9,28 + rotrdi 4,9,34 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + add 8,8,3 + add 8,8,5 + + li 5,4 + mtctr 5 +.align 4 +.Lrounds: + addi 7,7,128 + rotrdi 3,17,1 + rotrdi 4,17,8 + rotrdi 5,30,19 + rotrdi 0,30,61 + xor 3,3,4 + srdi 4,17,7 + xor 5,5,0 + srdi 0,30,6 + add 16,16,25 + xor 3,3,4 + xor 5,5,0 + ld 0,0(7) + add 16,16,3 + add 16,16,5 + rotrdi 3,12,14 + rotrdi 4,12,18 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrdi 4,4,23 + or 5,5,0 + add 15,15,16 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrdi 3,8,28 + rotrdi 4,8,34 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + add 15,15,3 + add 15,15,5 + + rotrdi 3,18,1 + rotrdi 4,18,8 + rotrdi 5,31,19 + rotrdi 0,31,61 + xor 3,3,4 + srdi 4,18,7 + xor 5,5,0 + srdi 0,31,6 + add 17,17,26 + xor 3,3,4 + xor 5,5,0 + ld 0,8(7) + add 17,17,3 + add 17,17,5 + rotrdi 3,11,14 + rotrdi 4,11,18 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrdi 4,4,23 + or 5,5,0 + add 14,14,17 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrdi 3,15,28 + rotrdi 4,15,34 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + add 14,14,3 + add 14,14,5 + + rotrdi 3,19,1 + rotrdi 4,19,8 + rotrdi 5,16,19 + rotrdi 0,16,61 + xor 3,3,4 + srdi 4,19,7 + xor 5,5,0 + srdi 0,16,6 + add 18,18,27 + xor 3,3,4 + xor 5,5,0 + ld 0,16(7) + add 18,18,3 + add 18,18,5 + rotrdi 3,10,14 + rotrdi 4,10,18 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrdi 4,4,23 + or 5,5,0 + add 6,6,18 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrdi 3,14,28 + rotrdi 4,14,34 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + add 6,6,3 + add 6,6,5 + + rotrdi 3,20,1 + rotrdi 4,20,8 + rotrdi 5,17,19 + rotrdi 0,17,61 + xor 3,3,4 + srdi 4,20,7 + xor 5,5,0 + srdi 0,17,6 + add 19,19,28 + xor 3,3,4 + xor 5,5,0 + ld 0,24(7) + add 19,19,3 + add 19,19,5 + rotrdi 3,9,14 + rotrdi 4,9,18 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrdi 4,4,23 + or 5,5,0 + add 12,12,19 + xor 3,3,4 + add 12,12,5 + add 
12,12,3 + + rotrdi 3,6,28 + rotrdi 4,6,34 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + add 12,12,3 + add 12,12,5 + + rotrdi 3,21,1 + rotrdi 4,21,8 + rotrdi 5,18,19 + rotrdi 0,18,61 + xor 3,3,4 + srdi 4,21,7 + xor 5,5,0 + srdi 0,18,6 + add 20,20,29 + xor 3,3,4 + xor 5,5,0 + ld 0,32(7) + add 20,20,3 + add 20,20,5 + rotrdi 3,8,14 + rotrdi 4,8,18 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrdi 4,4,23 + or 5,5,0 + add 11,11,20 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrdi 3,12,28 + rotrdi 4,12,34 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + add 11,11,3 + add 11,11,5 + + rotrdi 3,22,1 + rotrdi 4,22,8 + rotrdi 5,19,19 + rotrdi 0,19,61 + xor 3,3,4 + srdi 4,22,7 + xor 5,5,0 + srdi 0,19,6 + add 21,21,30 + xor 3,3,4 + xor 5,5,0 + ld 0,40(7) + add 21,21,3 + add 21,21,5 + rotrdi 3,15,14 + rotrdi 4,15,18 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrdi 4,4,23 + or 5,5,0 + add 10,10,21 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrdi 3,11,28 + rotrdi 4,11,34 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + add 10,10,3 + add 10,10,5 + + rotrdi 3,23,1 + rotrdi 4,23,8 + rotrdi 5,20,19 + rotrdi 0,20,61 + xor 3,3,4 + srdi 4,23,7 + xor 5,5,0 + srdi 0,20,6 + add 22,22,31 + xor 3,3,4 + xor 5,5,0 + ld 0,48(7) + add 22,22,3 + add 22,22,5 + rotrdi 3,14,14 + rotrdi 4,14,18 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrdi 4,4,23 + or 5,5,0 + add 9,9,22 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrdi 3,10,28 + rotrdi 4,10,34 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + add 9,9,3 + add 9,9,5 + + rotrdi 3,24,1 + rotrdi 4,24,8 + rotrdi 5,21,19 + rotrdi 0,21,61 + xor 3,3,4 + srdi 4,24,7 + xor 5,5,0 + srdi 0,21,6 + add 23,23,16 + xor 3,3,4 + xor 5,5,0 + ld 0,56(7) + add 23,23,3 + add 23,23,5 + rotrdi 3,6,14 + rotrdi 4,6,18 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrdi 4,4,23 + or 5,5,0 + add 8,8,23 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrdi 3,9,28 + rotrdi 4,9,34 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + add 8,8,3 + add 8,8,5 + + rotrdi 3,25,1 + rotrdi 4,25,8 + rotrdi 5,22,19 + rotrdi 0,22,61 + xor 3,3,4 + srdi 4,25,7 + xor 5,5,0 + srdi 0,22,6 + add 24,24,17 + xor 3,3,4 + xor 5,5,0 + ld 0,64(7) + add 24,24,3 + add 24,24,5 + rotrdi 3,12,14 + rotrdi 4,12,18 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrdi 4,4,23 + or 5,5,0 + add 15,15,24 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrdi 3,8,28 + rotrdi 4,8,34 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + add 15,15,3 + add 15,15,5 + + rotrdi 3,26,1 + rotrdi 4,26,8 + rotrdi 5,23,19 + rotrdi 0,23,61 + xor 3,3,4 + srdi 4,26,7 + xor 5,5,0 + srdi 0,23,6 + add 25,25,18 + xor 3,3,4 + xor 5,5,0 + ld 0,72(7) + add 25,25,3 + add 25,25,5 + rotrdi 3,11,14 + rotrdi 4,11,18 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrdi 4,4,23 + or 5,5,0 + add 14,14,25 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrdi 3,15,28 + rotrdi 4,15,34 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + add 14,14,3 + add 14,14,5 + + rotrdi 3,27,1 + rotrdi 4,27,8 + rotrdi 5,24,19 + rotrdi 0,24,61 + xor 3,3,4 + srdi 4,27,7 + 
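+/*
+ * Editorial annotation, not part of the OpenSSL-generated file: inside
+ * .Lrounds the message schedule is expanded in place.  Registers
+ * r16-r31 hold the 16 live words W[i-16]..W[i-1], and each group above
+ * computes, in C terms,
+ *
+ *   sigma0(x) = ror64(x,1)  ^ ror64(x,8)  ^ (x >> 7);
+ *   sigma1(x) = ror64(x,19) ^ ror64(x,61) ^ (x >> 6);
+ *   W[i] = W[i-16] + sigma0(W[i-15]) + W[i-7] + sigma1(W[i-2]);
+ *
+ * before the new word enters the normal round computation.  With 16
+ * rounds unrolled ahead of the loop and the counter set to 4, the loop
+ * supplies the remaining 64 of the 80 SHA-512 rounds.
+ */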
xor 5,5,0 + srdi 0,24,6 + add 26,26,19 + xor 3,3,4 + xor 5,5,0 + ld 0,80(7) + add 26,26,3 + add 26,26,5 + rotrdi 3,10,14 + rotrdi 4,10,18 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrdi 4,4,23 + or 5,5,0 + add 6,6,26 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrdi 3,14,28 + rotrdi 4,14,34 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + add 6,6,3 + add 6,6,5 + + rotrdi 3,28,1 + rotrdi 4,28,8 + rotrdi 5,25,19 + rotrdi 0,25,61 + xor 3,3,4 + srdi 4,28,7 + xor 5,5,0 + srdi 0,25,6 + add 27,27,20 + xor 3,3,4 + xor 5,5,0 + ld 0,88(7) + add 27,27,3 + add 27,27,5 + rotrdi 3,9,14 + rotrdi 4,9,18 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrdi 4,4,23 + or 5,5,0 + add 12,12,27 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrdi 3,6,28 + rotrdi 4,6,34 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + add 12,12,3 + add 12,12,5 + + rotrdi 3,29,1 + rotrdi 4,29,8 + rotrdi 5,26,19 + rotrdi 0,26,61 + xor 3,3,4 + srdi 4,29,7 + xor 5,5,0 + srdi 0,26,6 + add 28,28,21 + xor 3,3,4 + xor 5,5,0 + ld 0,96(7) + add 28,28,3 + add 28,28,5 + rotrdi 3,8,14 + rotrdi 4,8,18 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrdi 4,4,23 + or 5,5,0 + add 11,11,28 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrdi 3,12,28 + rotrdi 4,12,34 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + add 11,11,3 + add 11,11,5 + + rotrdi 3,30,1 + rotrdi 4,30,8 + rotrdi 5,27,19 + rotrdi 0,27,61 + xor 3,3,4 + srdi 4,30,7 + xor 5,5,0 + srdi 0,27,6 + add 29,29,22 + xor 3,3,4 + xor 5,5,0 + ld 0,104(7) + add 29,29,3 + add 29,29,5 + rotrdi 3,15,14 + rotrdi 4,15,18 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrdi 4,4,23 + or 5,5,0 + add 10,10,29 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrdi 3,11,28 + rotrdi 4,11,34 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + add 10,10,3 + add 10,10,5 + + rotrdi 3,31,1 + rotrdi 4,31,8 + rotrdi 5,28,19 + rotrdi 0,28,61 + xor 3,3,4 + srdi 4,31,7 + xor 5,5,0 + srdi 0,28,6 + add 30,30,23 + xor 3,3,4 + xor 5,5,0 + ld 0,112(7) + add 30,30,3 + add 30,30,5 + rotrdi 3,14,14 + rotrdi 4,14,18 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrdi 4,4,23 + or 5,5,0 + add 9,9,30 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrdi 3,10,28 + rotrdi 4,10,34 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + add 9,9,3 + add 9,9,5 + + rotrdi 3,16,1 + rotrdi 4,16,8 + rotrdi 5,29,19 + rotrdi 0,29,61 + xor 3,3,4 + srdi 4,16,7 + xor 5,5,0 + srdi 0,29,6 + add 31,31,24 + xor 3,3,4 + xor 5,5,0 + ld 0,120(7) + add 31,31,3 + add 31,31,5 + rotrdi 3,6,14 + rotrdi 4,6,18 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrdi 4,4,23 + or 5,5,0 + add 8,8,31 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrdi 3,9,28 + rotrdi 4,9,34 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + add 8,8,3 + add 8,8,5 + + bdnz .Lrounds + + ld 3,208(1) + ld 31,200(1) + ld 5,192(1) + subi 7,7,512 + + ld 16,0(3) + ld 17,8(3) + ld 18,16(3) + ld 19,24(3) + ld 20,32(3) + ld 21,40(3) + ld 22,48(3) + addi 31,31,128 + ld 23,56(3) + add 8,8,16 + add 9,9,17 + std 31,200(1) + add 10,10,18 + std 8,0(3) + add 11,11,19 + std 9,8(3) + add 12,12,20 + std 10,16(3) + add 6,6,21 + std 11,24(3) + add 14,14,22 + std 12,32(3) 
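+/*
+ * Editorial annotation, not part of the OpenSSL-generated file: this is
+ * the per-block epilogue.  The eight saved hash words are reloaded from
+ * the context (r3), the working variables are added back in (the FIPS
+ * 180-4 feed-forward, H[i] += a..h), the input pointer advances by the
+ * 128-byte block size, and the loop repeats until the end pointer kept
+ * on the stack is reached.
+ */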
+ add 15,15,23 + std 6,40(3) + std 14,48(3) + cmpld 31,5 + std 15,56(3) + bne .Lsha2_block_private + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.size .zfs_sha512_ppc,.-.zfs_sha512_ppc +.size zfs_sha512_ppc,.-.zfs_sha512_ppc +.align 6 +.LPICmeup: + mflr 0 + bcl 20,31,$+4 + mflr 7 + addi 7,7,56 + mtlr 0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.space 28 +.long 0x428a2f98,0xd728ae22 +.long 0x71374491,0x23ef65cd +.long 0xb5c0fbcf,0xec4d3b2f +.long 0xe9b5dba5,0x8189dbbc +.long 0x3956c25b,0xf348b538 +.long 0x59f111f1,0xb605d019 +.long 0x923f82a4,0xaf194f9b +.long 0xab1c5ed5,0xda6d8118 +.long 0xd807aa98,0xa3030242 +.long 0x12835b01,0x45706fbe +.long 0x243185be,0x4ee4b28c +.long 0x550c7dc3,0xd5ffb4e2 +.long 0x72be5d74,0xf27b896f +.long 0x80deb1fe,0x3b1696b1 +.long 0x9bdc06a7,0x25c71235 +.long 0xc19bf174,0xcf692694 +.long 0xe49b69c1,0x9ef14ad2 +.long 0xefbe4786,0x384f25e3 +.long 0x0fc19dc6,0x8b8cd5b5 +.long 0x240ca1cc,0x77ac9c65 +.long 0x2de92c6f,0x592b0275 +.long 0x4a7484aa,0x6ea6e483 +.long 0x5cb0a9dc,0xbd41fbd4 +.long 0x76f988da,0x831153b5 +.long 0x983e5152,0xee66dfab +.long 0xa831c66d,0x2db43210 +.long 0xb00327c8,0x98fb213f +.long 0xbf597fc7,0xbeef0ee4 +.long 0xc6e00bf3,0x3da88fc2 +.long 0xd5a79147,0x930aa725 +.long 0x06ca6351,0xe003826f +.long 0x14292967,0x0a0e6e70 +.long 0x27b70a85,0x46d22ffc +.long 0x2e1b2138,0x5c26c926 +.long 0x4d2c6dfc,0x5ac42aed +.long 0x53380d13,0x9d95b3df +.long 0x650a7354,0x8baf63de +.long 0x766a0abb,0x3c77b2a8 +.long 0x81c2c92e,0x47edaee6 +.long 0x92722c85,0x1482353b +.long 0xa2bfe8a1,0x4cf10364 +.long 0xa81a664b,0xbc423001 +.long 0xc24b8b70,0xd0f89791 +.long 0xc76c51a3,0x0654be30 +.long 0xd192e819,0xd6ef5218 +.long 0xd6990624,0x5565a910 +.long 0xf40e3585,0x5771202a +.long 0x106aa070,0x32bbd1b8 +.long 0x19a4c116,0xb8d2d0c8 +.long 0x1e376c08,0x5141ab53 +.long 0x2748774c,0xdf8eeb99 +.long 0x34b0bcb5,0xe19b48a8 +.long 0x391c0cb3,0xc5c95a63 +.long 0x4ed8aa4a,0xe3418acb +.long 0x5b9cca4f,0x7763e373 +.long 0x682e6ff3,0xd6b2b8a3 +.long 0x748f82ee,0x5defb2fc +.long 0x78a5636f,0x43172f60 +.long 0x84c87814,0xa1f0ab72 +.long 0x8cc70208,0x1a6439ec +.long 0x90befffa,0x23631e28 +.long 0xa4506ceb,0xde82bde9 +.long 0xbef9a3f7,0xb2c67915 +.long 0xc67178f2,0xe372532b +.long 0xca273ece,0xea26619c +.long 0xd186b8c7,0x21c0c207 +.long 0xeada7dd6,0xcde0eb1e +.long 0xf57d4f7f,0xee6ed178 +.long 0x06f067aa,0x72176fba +.long 0x0a637dc5,0xa2c898a6 +.long 0x113f9804,0xbef90dae +.long 0x1b710b35,0x131c471b +.long 0x28db77f5,0x23047d84 +.long 0x32caab7b,0x40c72493 +.long 0x3c9ebe0a,0x15c9bebc +.long 0x431d67c4,0x9c100d4c +.long 0x4cc5d4be,0xcb3e42b6 +.long 0x597f299c,0xfc657e2a +.long 0x5fcb6fab,0x3ad6faec +.long 0x6c44198c,0x4a475817 + +#elif (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + +.abiversion 2 +.text + +.globl zfs_sha512_ppc +.type zfs_sha512_ppc,@function +.align 6 +zfs_sha512_ppc: +.localentry zfs_sha512_ppc,0 + + stdu 1,-384(1) + mflr 0 + sldi 5,5,7 + + std 3,208(1) + + std 14,240(1) + std 15,248(1) + std 16,256(1) + std 17,264(1) + std 18,272(1) + std 19,280(1) + std 20,288(1) + std 21,296(1) + std 22,304(1) + std 23,312(1) + std 24,320(1) + std 25,328(1) + std 26,336(1) + std 27,344(1) + std 28,352(1) + std 29,360(1) + std 30,368(1) + std 31,376(1) + std 0,400(1) + ld 8,0(3) + mr 31,4 + ld 9,8(3) + ld 10,16(3) + ld 11,24(3) + ld 12,32(3) + ld 6,40(3) + ld 14,48(3) + ld 15,56(3) + bl .LPICmeup +.LPICedup: + andi. 
0,31,3 + bne .Lunaligned +.Laligned: + add 5,31,5 + std 5,192(1) + std 31,200(1) + bl .Lsha2_block_private + b .Ldone + +.align 4 +.Lunaligned: + subfic 0,31,4096 + andi. 0,0,3968 + beq .Lcross_page + cmpld 5,0 + ble .Laligned + subfc 5,0,5 + add 0,31,0 + std 5,184(1) + std 0,192(1) + std 31,200(1) + bl .Lsha2_block_private + + ld 5,184(1) +.Lcross_page: + li 0,32 + mtctr 0 + addi 20,1,48 +.Lmemcpy: + lbz 16,0(31) + lbz 17,1(31) + lbz 18,2(31) + lbz 19,3(31) + addi 31,31,4 + stb 16,0(20) + stb 17,1(20) + stb 18,2(20) + stb 19,3(20) + addi 20,20,4 + bdnz .Lmemcpy + std 31,176(1) + addi 0,1,176 + addi 31,1,48 + std 5,184(1) + std 0,192(1) + std 31,200(1) + bl .Lsha2_block_private + ld 31,176(1) + ld 5,184(1) + addic. 5,5,-128 + bne .Lunaligned + +.Ldone: + ld 0,400(1) + ld 14,240(1) + ld 15,248(1) + ld 16,256(1) + ld 17,264(1) + ld 18,272(1) + ld 19,280(1) + ld 20,288(1) + ld 21,296(1) + ld 22,304(1) + ld 23,312(1) + ld 24,320(1) + ld 25,328(1) + ld 26,336(1) + ld 27,344(1) + ld 28,352(1) + ld 29,360(1) + ld 30,368(1) + ld 31,376(1) + mtlr 0 + addi 1,1,384 + blr +.long 0 +.byte 0,12,4,1,0x80,18,3,0 +.long 0 +.align 4 +.Lsha2_block_private: + ld 0,0(7) + lwz 3,0(31) + lwz 4,4(31) + rotlwi 5,3,8 + rotlwi 16,4,8 + rlwimi 5,3,24,0,7 + rlwimi 16,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 16,4,24,16,23 + insrdi 16,5,32,0 + rotrdi 3,12,14 + rotrdi 4,12,18 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrdi 4,4,23 + or 5,5,0 + add 15,15,16 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrdi 3,8,28 + rotrdi 4,8,34 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + ld 0,8(7) + add 15,15,3 + add 15,15,5 + + lwz 3,8(31) + lwz 4,12(31) + rotlwi 5,3,8 + rotlwi 17,4,8 + rlwimi 5,3,24,0,7 + rlwimi 17,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 17,4,24,16,23 + insrdi 17,5,32,0 + rotrdi 3,11,14 + rotrdi 4,11,18 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrdi 4,4,23 + or 5,5,0 + add 14,14,17 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrdi 3,15,28 + rotrdi 4,15,34 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + ld 0,16(7) + add 14,14,3 + add 14,14,5 + + lwz 3,16(31) + lwz 4,20(31) + rotlwi 5,3,8 + rotlwi 18,4,8 + rlwimi 5,3,24,0,7 + rlwimi 18,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 18,4,24,16,23 + insrdi 18,5,32,0 + rotrdi 3,10,14 + rotrdi 4,10,18 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrdi 4,4,23 + or 5,5,0 + add 6,6,18 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrdi 3,14,28 + rotrdi 4,14,34 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + ld 0,24(7) + add 6,6,3 + add 6,6,5 + + lwz 3,24(31) + lwz 4,28(31) + rotlwi 5,3,8 + rotlwi 19,4,8 + rlwimi 5,3,24,0,7 + rlwimi 19,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 19,4,24,16,23 + insrdi 19,5,32,0 + rotrdi 3,9,14 + rotrdi 4,9,18 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrdi 4,4,23 + or 5,5,0 + add 12,12,19 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrdi 3,6,28 + rotrdi 4,6,34 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + ld 0,32(7) + add 12,12,3 + add 12,12,5 + + lwz 3,32(31) + lwz 4,36(31) + rotlwi 5,3,8 + rotlwi 20,4,8 + rlwimi 5,3,24,0,7 + rlwimi 20,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 20,4,24,16,23 + insrdi 20,5,32,0 + rotrdi 3,8,14 + rotrdi 4,8,18 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrdi 4,4,23 + or 5,5,0 + add 
11,11,20 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrdi 3,12,28 + rotrdi 4,12,34 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + ld 0,40(7) + add 11,11,3 + add 11,11,5 + + lwz 3,40(31) + lwz 4,44(31) + rotlwi 5,3,8 + rotlwi 21,4,8 + rlwimi 5,3,24,0,7 + rlwimi 21,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 21,4,24,16,23 + insrdi 21,5,32,0 + rotrdi 3,15,14 + rotrdi 4,15,18 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrdi 4,4,23 + or 5,5,0 + add 10,10,21 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrdi 3,11,28 + rotrdi 4,11,34 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + ld 0,48(7) + add 10,10,3 + add 10,10,5 + + lwz 3,48(31) + lwz 4,52(31) + rotlwi 5,3,8 + rotlwi 22,4,8 + rlwimi 5,3,24,0,7 + rlwimi 22,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 22,4,24,16,23 + insrdi 22,5,32,0 + rotrdi 3,14,14 + rotrdi 4,14,18 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrdi 4,4,23 + or 5,5,0 + add 9,9,22 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrdi 3,10,28 + rotrdi 4,10,34 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + ld 0,56(7) + add 9,9,3 + add 9,9,5 + + lwz 3,56(31) + lwz 4,60(31) + rotlwi 5,3,8 + rotlwi 23,4,8 + rlwimi 5,3,24,0,7 + rlwimi 23,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 23,4,24,16,23 + insrdi 23,5,32,0 + rotrdi 3,6,14 + rotrdi 4,6,18 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrdi 4,4,23 + or 5,5,0 + add 8,8,23 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrdi 3,9,28 + rotrdi 4,9,34 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + ld 0,64(7) + add 8,8,3 + add 8,8,5 + + lwz 3,64(31) + lwz 4,68(31) + rotlwi 5,3,8 + rotlwi 24,4,8 + rlwimi 5,3,24,0,7 + rlwimi 24,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 24,4,24,16,23 + insrdi 24,5,32,0 + rotrdi 3,12,14 + rotrdi 4,12,18 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrdi 4,4,23 + or 5,5,0 + add 15,15,24 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrdi 3,8,28 + rotrdi 4,8,34 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + ld 0,72(7) + add 15,15,3 + add 15,15,5 + + lwz 3,72(31) + lwz 4,76(31) + rotlwi 5,3,8 + rotlwi 25,4,8 + rlwimi 5,3,24,0,7 + rlwimi 25,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 25,4,24,16,23 + insrdi 25,5,32,0 + rotrdi 3,11,14 + rotrdi 4,11,18 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrdi 4,4,23 + or 5,5,0 + add 14,14,25 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrdi 3,15,28 + rotrdi 4,15,34 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + ld 0,80(7) + add 14,14,3 + add 14,14,5 + + lwz 3,80(31) + lwz 4,84(31) + rotlwi 5,3,8 + rotlwi 26,4,8 + rlwimi 5,3,24,0,7 + rlwimi 26,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 26,4,24,16,23 + insrdi 26,5,32,0 + rotrdi 3,10,14 + rotrdi 4,10,18 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrdi 4,4,23 + or 5,5,0 + add 6,6,26 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrdi 3,14,28 + rotrdi 4,14,34 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + ld 0,88(7) + add 6,6,3 + add 6,6,5 + + lwz 3,88(31) + lwz 4,92(31) + rotlwi 5,3,8 + rotlwi 27,4,8 + rlwimi 5,3,24,0,7 + rlwimi 27,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 27,4,24,16,23 + insrdi 
27,5,32,0 + rotrdi 3,9,14 + rotrdi 4,9,18 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrdi 4,4,23 + or 5,5,0 + add 12,12,27 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrdi 3,6,28 + rotrdi 4,6,34 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + ld 0,96(7) + add 12,12,3 + add 12,12,5 + + lwz 3,96(31) + lwz 4,100(31) + rotlwi 5,3,8 + rotlwi 28,4,8 + rlwimi 5,3,24,0,7 + rlwimi 28,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 28,4,24,16,23 + insrdi 28,5,32,0 + rotrdi 3,8,14 + rotrdi 4,8,18 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrdi 4,4,23 + or 5,5,0 + add 11,11,28 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrdi 3,12,28 + rotrdi 4,12,34 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + ld 0,104(7) + add 11,11,3 + add 11,11,5 + + lwz 3,104(31) + lwz 4,108(31) + rotlwi 5,3,8 + rotlwi 29,4,8 + rlwimi 5,3,24,0,7 + rlwimi 29,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 29,4,24,16,23 + insrdi 29,5,32,0 + rotrdi 3,15,14 + rotrdi 4,15,18 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrdi 4,4,23 + or 5,5,0 + add 10,10,29 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrdi 3,11,28 + rotrdi 4,11,34 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + ld 0,112(7) + add 10,10,3 + add 10,10,5 + + lwz 3,112(31) + lwz 4,116(31) + rotlwi 5,3,8 + rotlwi 30,4,8 + rlwimi 5,3,24,0,7 + rlwimi 30,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 30,4,24,16,23 + insrdi 30,5,32,0 + rotrdi 3,14,14 + rotrdi 4,14,18 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrdi 4,4,23 + or 5,5,0 + add 9,9,30 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrdi 3,10,28 + rotrdi 4,10,34 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + ld 0,120(7) + add 9,9,3 + add 9,9,5 + + lwz 3,120(31) + lwz 4,124(31) + rotlwi 5,3,8 + rotlwi 31,4,8 + rlwimi 5,3,24,0,7 + rlwimi 31,4,24,0,7 + rlwimi 5,3,24,16,23 + rlwimi 31,4,24,16,23 + insrdi 31,5,32,0 + rotrdi 3,6,14 + rotrdi 4,6,18 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrdi 4,4,23 + or 5,5,0 + add 8,8,31 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrdi 3,9,28 + rotrdi 4,9,34 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + add 8,8,3 + add 8,8,5 + + li 5,4 + mtctr 5 +.align 4 +.Lrounds: + addi 7,7,128 + rotrdi 3,17,1 + rotrdi 4,17,8 + rotrdi 5,30,19 + rotrdi 0,30,61 + xor 3,3,4 + srdi 4,17,7 + xor 5,5,0 + srdi 0,30,6 + add 16,16,25 + xor 3,3,4 + xor 5,5,0 + ld 0,0(7) + add 16,16,3 + add 16,16,5 + rotrdi 3,12,14 + rotrdi 4,12,18 + and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrdi 4,4,23 + or 5,5,0 + add 15,15,16 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrdi 3,8,28 + rotrdi 4,8,34 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + add 15,15,3 + add 15,15,5 + + rotrdi 3,18,1 + rotrdi 4,18,8 + rotrdi 5,31,19 + rotrdi 0,31,61 + xor 3,3,4 + srdi 4,18,7 + xor 5,5,0 + srdi 0,31,6 + add 17,17,26 + xor 3,3,4 + xor 5,5,0 + ld 0,8(7) + add 17,17,3 + add 17,17,5 + rotrdi 3,11,14 + rotrdi 4,11,18 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrdi 4,4,23 + or 5,5,0 + add 14,14,17 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrdi 3,15,28 + rotrdi 4,15,34 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,8,9 + xor 
3,3,4 + add 10,10,14 + xor 5,5,0 + add 14,14,3 + add 14,14,5 + + rotrdi 3,19,1 + rotrdi 4,19,8 + rotrdi 5,16,19 + rotrdi 0,16,61 + xor 3,3,4 + srdi 4,19,7 + xor 5,5,0 + srdi 0,16,6 + add 18,18,27 + xor 3,3,4 + xor 5,5,0 + ld 0,16(7) + add 18,18,3 + add 18,18,5 + rotrdi 3,10,14 + rotrdi 4,10,18 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrdi 4,4,23 + or 5,5,0 + add 6,6,18 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrdi 3,14,28 + rotrdi 4,14,34 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + add 6,6,3 + add 6,6,5 + + rotrdi 3,20,1 + rotrdi 4,20,8 + rotrdi 5,17,19 + rotrdi 0,17,61 + xor 3,3,4 + srdi 4,20,7 + xor 5,5,0 + srdi 0,17,6 + add 19,19,28 + xor 3,3,4 + xor 5,5,0 + ld 0,24(7) + add 19,19,3 + add 19,19,5 + rotrdi 3,9,14 + rotrdi 4,9,18 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrdi 4,4,23 + or 5,5,0 + add 12,12,19 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrdi 3,6,28 + rotrdi 4,6,34 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + add 12,12,3 + add 12,12,5 + + rotrdi 3,21,1 + rotrdi 4,21,8 + rotrdi 5,18,19 + rotrdi 0,18,61 + xor 3,3,4 + srdi 4,21,7 + xor 5,5,0 + srdi 0,18,6 + add 20,20,29 + xor 3,3,4 + xor 5,5,0 + ld 0,32(7) + add 20,20,3 + add 20,20,5 + rotrdi 3,8,14 + rotrdi 4,8,18 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrdi 4,4,23 + or 5,5,0 + add 11,11,20 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrdi 3,12,28 + rotrdi 4,12,34 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + add 11,11,3 + add 11,11,5 + + rotrdi 3,22,1 + rotrdi 4,22,8 + rotrdi 5,19,19 + rotrdi 0,19,61 + xor 3,3,4 + srdi 4,22,7 + xor 5,5,0 + srdi 0,19,6 + add 21,21,30 + xor 3,3,4 + xor 5,5,0 + ld 0,40(7) + add 21,21,3 + add 21,21,5 + rotrdi 3,15,14 + rotrdi 4,15,18 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrdi 4,4,23 + or 5,5,0 + add 10,10,21 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrdi 3,11,28 + rotrdi 4,11,34 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + add 10,10,3 + add 10,10,5 + + rotrdi 3,23,1 + rotrdi 4,23,8 + rotrdi 5,20,19 + rotrdi 0,20,61 + xor 3,3,4 + srdi 4,23,7 + xor 5,5,0 + srdi 0,20,6 + add 22,22,31 + xor 3,3,4 + xor 5,5,0 + ld 0,48(7) + add 22,22,3 + add 22,22,5 + rotrdi 3,14,14 + rotrdi 4,14,18 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrdi 4,4,23 + or 5,5,0 + add 9,9,22 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrdi 3,10,28 + rotrdi 4,10,34 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 5,5,0 + add 9,9,3 + add 9,9,5 + + rotrdi 3,24,1 + rotrdi 4,24,8 + rotrdi 5,21,19 + rotrdi 0,21,61 + xor 3,3,4 + srdi 4,24,7 + xor 5,5,0 + srdi 0,21,6 + add 23,23,16 + xor 3,3,4 + xor 5,5,0 + ld 0,56(7) + add 23,23,3 + add 23,23,5 + rotrdi 3,6,14 + rotrdi 4,6,18 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrdi 4,4,23 + or 5,5,0 + add 8,8,23 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrdi 3,9,28 + rotrdi 4,9,34 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + add 8,8,3 + add 8,8,5 + + rotrdi 3,25,1 + rotrdi 4,25,8 + rotrdi 5,22,19 + rotrdi 0,22,61 + xor 3,3,4 + srdi 4,25,7 + xor 5,5,0 + srdi 0,22,6 + add 24,24,17 + xor 3,3,4 + xor 5,5,0 + ld 0,64(7) + add 24,24,3 + add 24,24,5 + rotrdi 3,12,14 + rotrdi 4,12,18 + 
and 5,6,12 + xor 3,3,4 + add 15,15,0 + andc 0,14,12 + rotrdi 4,4,23 + or 5,5,0 + add 15,15,24 + xor 3,3,4 + add 15,15,5 + add 15,15,3 + + rotrdi 3,8,28 + rotrdi 4,8,34 + and 5,8,9 + and 0,8,10 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,9,10 + xor 3,3,4 + add 11,11,15 + xor 5,5,0 + add 15,15,3 + add 15,15,5 + + rotrdi 3,26,1 + rotrdi 4,26,8 + rotrdi 5,23,19 + rotrdi 0,23,61 + xor 3,3,4 + srdi 4,26,7 + xor 5,5,0 + srdi 0,23,6 + add 25,25,18 + xor 3,3,4 + xor 5,5,0 + ld 0,72(7) + add 25,25,3 + add 25,25,5 + rotrdi 3,11,14 + rotrdi 4,11,18 + and 5,12,11 + xor 3,3,4 + add 14,14,0 + andc 0,6,11 + rotrdi 4,4,23 + or 5,5,0 + add 14,14,25 + xor 3,3,4 + add 14,14,5 + add 14,14,3 + + rotrdi 3,15,28 + rotrdi 4,15,34 + and 5,15,8 + and 0,15,9 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,8,9 + xor 3,3,4 + add 10,10,14 + xor 5,5,0 + add 14,14,3 + add 14,14,5 + + rotrdi 3,27,1 + rotrdi 4,27,8 + rotrdi 5,24,19 + rotrdi 0,24,61 + xor 3,3,4 + srdi 4,27,7 + xor 5,5,0 + srdi 0,24,6 + add 26,26,19 + xor 3,3,4 + xor 5,5,0 + ld 0,80(7) + add 26,26,3 + add 26,26,5 + rotrdi 3,10,14 + rotrdi 4,10,18 + and 5,11,10 + xor 3,3,4 + add 6,6,0 + andc 0,12,10 + rotrdi 4,4,23 + or 5,5,0 + add 6,6,26 + xor 3,3,4 + add 6,6,5 + add 6,6,3 + + rotrdi 3,14,28 + rotrdi 4,14,34 + and 5,14,15 + and 0,14,8 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,15,8 + xor 3,3,4 + add 9,9,6 + xor 5,5,0 + add 6,6,3 + add 6,6,5 + + rotrdi 3,28,1 + rotrdi 4,28,8 + rotrdi 5,25,19 + rotrdi 0,25,61 + xor 3,3,4 + srdi 4,28,7 + xor 5,5,0 + srdi 0,25,6 + add 27,27,20 + xor 3,3,4 + xor 5,5,0 + ld 0,88(7) + add 27,27,3 + add 27,27,5 + rotrdi 3,9,14 + rotrdi 4,9,18 + and 5,10,9 + xor 3,3,4 + add 12,12,0 + andc 0,11,9 + rotrdi 4,4,23 + or 5,5,0 + add 12,12,27 + xor 3,3,4 + add 12,12,5 + add 12,12,3 + + rotrdi 3,6,28 + rotrdi 4,6,34 + and 5,6,14 + and 0,6,15 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,14,15 + xor 3,3,4 + add 8,8,12 + xor 5,5,0 + add 12,12,3 + add 12,12,5 + + rotrdi 3,29,1 + rotrdi 4,29,8 + rotrdi 5,26,19 + rotrdi 0,26,61 + xor 3,3,4 + srdi 4,29,7 + xor 5,5,0 + srdi 0,26,6 + add 28,28,21 + xor 3,3,4 + xor 5,5,0 + ld 0,96(7) + add 28,28,3 + add 28,28,5 + rotrdi 3,8,14 + rotrdi 4,8,18 + and 5,9,8 + xor 3,3,4 + add 11,11,0 + andc 0,10,8 + rotrdi 4,4,23 + or 5,5,0 + add 11,11,28 + xor 3,3,4 + add 11,11,5 + add 11,11,3 + + rotrdi 3,12,28 + rotrdi 4,12,34 + and 5,12,6 + and 0,12,14 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,6,14 + xor 3,3,4 + add 15,15,11 + xor 5,5,0 + add 11,11,3 + add 11,11,5 + + rotrdi 3,30,1 + rotrdi 4,30,8 + rotrdi 5,27,19 + rotrdi 0,27,61 + xor 3,3,4 + srdi 4,30,7 + xor 5,5,0 + srdi 0,27,6 + add 29,29,22 + xor 3,3,4 + xor 5,5,0 + ld 0,104(7) + add 29,29,3 + add 29,29,5 + rotrdi 3,15,14 + rotrdi 4,15,18 + and 5,8,15 + xor 3,3,4 + add 10,10,0 + andc 0,9,15 + rotrdi 4,4,23 + or 5,5,0 + add 10,10,29 + xor 3,3,4 + add 10,10,5 + add 10,10,3 + + rotrdi 3,11,28 + rotrdi 4,11,34 + and 5,11,12 + and 0,11,6 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,12,6 + xor 3,3,4 + add 14,14,10 + xor 5,5,0 + add 10,10,3 + add 10,10,5 + + rotrdi 3,31,1 + rotrdi 4,31,8 + rotrdi 5,28,19 + rotrdi 0,28,61 + xor 3,3,4 + srdi 4,31,7 + xor 5,5,0 + srdi 0,28,6 + add 30,30,23 + xor 3,3,4 + xor 5,5,0 + ld 0,112(7) + add 30,30,3 + add 30,30,5 + rotrdi 3,14,14 + rotrdi 4,14,18 + and 5,15,14 + xor 3,3,4 + add 9,9,0 + andc 0,8,14 + rotrdi 4,4,23 + or 5,5,0 + add 9,9,30 + xor 3,3,4 + add 9,9,5 + add 9,9,3 + + rotrdi 3,10,28 + rotrdi 4,10,34 + and 5,10,11 + and 0,10,12 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,11,12 + xor 3,3,4 + add 6,6,9 + xor 
5,5,0 + add 9,9,3 + add 9,9,5 + + rotrdi 3,16,1 + rotrdi 4,16,8 + rotrdi 5,29,19 + rotrdi 0,29,61 + xor 3,3,4 + srdi 4,16,7 + xor 5,5,0 + srdi 0,29,6 + add 31,31,24 + xor 3,3,4 + xor 5,5,0 + ld 0,120(7) + add 31,31,3 + add 31,31,5 + rotrdi 3,6,14 + rotrdi 4,6,18 + and 5,14,6 + xor 3,3,4 + add 8,8,0 + andc 0,15,6 + rotrdi 4,4,23 + or 5,5,0 + add 8,8,31 + xor 3,3,4 + add 8,8,5 + add 8,8,3 + + rotrdi 3,9,28 + rotrdi 4,9,34 + and 5,9,10 + and 0,9,11 + xor 3,3,4 + rotrdi 4,4,5 + xor 5,5,0 + and 0,10,11 + xor 3,3,4 + add 12,12,8 + xor 5,5,0 + add 8,8,3 + add 8,8,5 + + bdnz .Lrounds + + ld 3,208(1) + ld 31,200(1) + ld 5,192(1) + subi 7,7,512 + + ld 16,0(3) + ld 17,8(3) + ld 18,16(3) + ld 19,24(3) + ld 20,32(3) + ld 21,40(3) + ld 22,48(3) + addi 31,31,128 + ld 23,56(3) + add 8,8,16 + add 9,9,17 + std 31,200(1) + add 10,10,18 + std 8,0(3) + add 11,11,19 + std 9,8(3) + add 12,12,20 + std 10,16(3) + add 6,6,21 + std 11,24(3) + add 14,14,22 + std 12,32(3) + add 15,15,23 + std 6,40(3) + std 14,48(3) + cmpld 31,5 + std 15,56(3) + bne .Lsha2_block_private + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.size zfs_sha512_ppc,.-zfs_sha512_ppc +.align 6 +.LPICmeup: + mflr 0 + bcl 20,31,$+4 + mflr 7 + addi 7,7,56 + mtlr 0 + blr +.long 0 +.byte 0,12,0x14,0,0,0,0,0 +.space 28 +.long 0xd728ae22,0x428a2f98 +.long 0x23ef65cd,0x71374491 +.long 0xec4d3b2f,0xb5c0fbcf +.long 0x8189dbbc,0xe9b5dba5 +.long 0xf348b538,0x3956c25b +.long 0xb605d019,0x59f111f1 +.long 0xaf194f9b,0x923f82a4 +.long 0xda6d8118,0xab1c5ed5 +.long 0xa3030242,0xd807aa98 +.long 0x45706fbe,0x12835b01 +.long 0x4ee4b28c,0x243185be +.long 0xd5ffb4e2,0x550c7dc3 +.long 0xf27b896f,0x72be5d74 +.long 0x3b1696b1,0x80deb1fe +.long 0x25c71235,0x9bdc06a7 +.long 0xcf692694,0xc19bf174 +.long 0x9ef14ad2,0xe49b69c1 +.long 0x384f25e3,0xefbe4786 +.long 0x8b8cd5b5,0x0fc19dc6 +.long 0x77ac9c65,0x240ca1cc +.long 0x592b0275,0x2de92c6f +.long 0x6ea6e483,0x4a7484aa +.long 0xbd41fbd4,0x5cb0a9dc +.long 0x831153b5,0x76f988da +.long 0xee66dfab,0x983e5152 +.long 0x2db43210,0xa831c66d +.long 0x98fb213f,0xb00327c8 +.long 0xbeef0ee4,0xbf597fc7 +.long 0x3da88fc2,0xc6e00bf3 +.long 0x930aa725,0xd5a79147 +.long 0xe003826f,0x06ca6351 +.long 0x0a0e6e70,0x14292967 +.long 0x46d22ffc,0x27b70a85 +.long 0x5c26c926,0x2e1b2138 +.long 0x5ac42aed,0x4d2c6dfc +.long 0x9d95b3df,0x53380d13 +.long 0x8baf63de,0x650a7354 +.long 0x3c77b2a8,0x766a0abb +.long 0x47edaee6,0x81c2c92e +.long 0x1482353b,0x92722c85 +.long 0x4cf10364,0xa2bfe8a1 +.long 0xbc423001,0xa81a664b +.long 0xd0f89791,0xc24b8b70 +.long 0x0654be30,0xc76c51a3 +.long 0xd6ef5218,0xd192e819 +.long 0x5565a910,0xd6990624 +.long 0x5771202a,0xf40e3585 +.long 0x32bbd1b8,0x106aa070 +.long 0xb8d2d0c8,0x19a4c116 +.long 0x5141ab53,0x1e376c08 +.long 0xdf8eeb99,0x2748774c +.long 0xe19b48a8,0x34b0bcb5 +.long 0xc5c95a63,0x391c0cb3 +.long 0xe3418acb,0x4ed8aa4a +.long 0x7763e373,0x5b9cca4f +.long 0xd6b2b8a3,0x682e6ff3 +.long 0x5defb2fc,0x748f82ee +.long 0x43172f60,0x78a5636f +.long 0xa1f0ab72,0x84c87814 +.long 0x1a6439ec,0x8cc70208 +.long 0x23631e28,0x90befffa +.long 0xde82bde9,0xa4506ceb +.long 0xb2c67915,0xbef9a3f7 +.long 0xe372532b,0xc67178f2 +.long 0xea26619c,0xca273ece +.long 0x21c0c207,0xd186b8c7 +.long 0xcde0eb1e,0xeada7dd6 +.long 0xee6ed178,0xf57d4f7f +.long 0x72176fba,0x06f067aa +.long 0xa2c898a6,0x0a637dc5 +.long 0xbef90dae,0x113f9804 +.long 0x131c471b,0x1b710b35 +.long 0x23047d84,0x28db77f5 +.long 0x40c72493,0x32caab7b +.long 0x15c9bebc,0x3c9ebe0a +.long 0x9c100d4c,0x431d67c4 +.long 0xcb3e42b6,0x4cc5d4be +.long 0xfc657e2a,0x597f299c +.long 
0x3ad6faec,0x5fcb6fab +.long 0x4a475817,0x6c44198c + +#endif diff --git a/module/icp/asm-x86_64/sha2/sha256-x86_64.S b/module/icp/asm-x86_64/sha2/sha256-x86_64.S new file mode 100644 index 000000000000..92cf3547093e --- /dev/null +++ b/module/icp/asm-x86_64/sha2/sha256-x86_64.S @@ -0,0 +1,3742 @@ +/* + * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#if defined(__x86_64) + +.text + +.align 64 +.type K256,@object +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.globl sha256_transform_shani +.type sha256_transform_shani,@function +.align 64 +sha256_transform_shani: +.cfi_startproc + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 512-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp .Loop_shani + +.align 16 +.Loop_shani: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 
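+/*
+ * Editorial annotation, not part of the OpenSSL-generated file: the
+ * .byte sequences in this routine are hand-encoded opcodes for
+ * assemblers that predate the SHA-NI/SSSE3 mnemonics:
+ *
+ *   .byte 15,56,203,...    = sha256rnds2 (0F 38 CB /r)
+ *   .byte 15,56,204,...    = sha256msg1  (0F 38 CC /r)
+ *   .byte 15,56,205,...    = sha256msg2  (0F 38 CD /r)
+ *   .byte 102,15,56,0,...  = pshufb      (66 0F 38 00 /r)
+ *   .byte 102,15,58,15,... = palignr     (66 0F 3A 0F /r ib)
+ *
+ * Each sha256rnds2 performs two rounds, so the pair issued per 32-byte
+ * chunk of K256 (the second after pshufd swaps the constant halves)
+ * covers four rounds, while sha256msg1/sha256msg2 update the schedule.
+ */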
+ pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 64-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 96-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 224-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 256-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 288-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 320-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 352-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 384-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 416-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 448-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 480-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz .Loop_shani + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu 
%xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size sha256_transform_shani,.-sha256_transform_shani +.globl sha256_transform_ssse3 +.type sha256_transform_ssse3,@function +.align 64 +sha256_transform_ssse3: +.cfi_startproc + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 +.Lprologue_ssse3: + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + jmp .Lloop_ssse3 +.align 16 +.Lloop_ssse3: + movdqa K256+512(%rip),%xmm7 + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rbp +.byte 102,15,56,0,207 + movdqa 0(%rbp),%xmm4 + movdqa 32(%rbp),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 64(%rbp),%xmm6 +.byte 102,15,56,0,223 + movdqa 96(%rbp),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lssse3_00_47 + +.align 16 +.Lssse3_00_47: + subq $-128,%rbp + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + 
movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 32(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d 
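+/*
+ * Editorial annotation, not part of the OpenSSL-generated file: in
+ * .Lssse3_00_47 the schedule for four W words is computed per %xmm
+ * register while the scalar rounds run interleaved on the integer
+ * units.  The vector shift/xor groups implement the SHA-256 small
+ * sigmas, in C terms
+ *
+ *   sigma0(x) = ror32(x,7)  ^ ror32(x,18) ^ (x >> 3);
+ *   sigma1(x) = ror32(x,17) ^ ror32(x,19) ^ (x >> 10);
+ *
+ * and the finished W[i]+K[i] values are stored at 0..48(%rsp) for the
+ * following scalar rounds.  The AVX routine further below computes the
+ * same schedule with non-destructive three-operand forms.
+ */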
+.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 64(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl 
%r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 96(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne .Lssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + 
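+/*
+ * Editorial annotation, not part of the OpenSSL-generated file: the
+ * cmpb above exits the schedule loop by probing a sentinel byte: %rbp
+ * steps through K256 in 128-byte strides, and 131(%rbp) only hits a
+ * zero byte once it reaches the byte-swap masks stored after the
+ * constants, i.e. after 48 rounds.  The remaining 16 rounds then run
+ * straight-line on the W[i]+K[i] values already parked on the stack.
+ */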
xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl 
$2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_ssse3 + + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_ssse3: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha256_transform_ssse3,.-sha256_transform_ssse3 +.globl sha256_transform_avx +.type sha256_transform_avx,@function +.align 64 +sha256_transform_avx: +.cfi_startproc + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 +.Lprologue_avx: + + vzeroupper + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 
20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + vmovdqa K256+512+32(%rip),%xmm8 + vmovdqa K256+512+64(%rip),%xmm9 + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa K256+512(%rip),%xmm7 + vmovdqu 0(%rsi),%xmm0 + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm7,%xmm0,%xmm0 + leaq K256(%rip),%rbp + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd 0(%rbp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 32(%rbp),%xmm1,%xmm5 + vpaddd 64(%rbp),%xmm2,%xmm6 + vpaddd 96(%rbp),%xmm3,%xmm7 + vmovdqa %xmm4,0(%rsp) + movl %eax,%r14d + vmovdqa %xmm5,16(%rsp) + movl %ebx,%edi + vmovdqa %xmm6,32(%rsp) + xorl %ecx,%edi + vmovdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + subq $-128,%rbp + vpalignr $4,%xmm0,%xmm1,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm2,%xmm3,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm0,%xmm0 + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm3,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm0,%xmm0 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm0,%xmm0 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + vpshufd $80,%xmm0,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm0,%xmm0 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 0(%rbp),%xmm0,%xmm6 + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl 
%r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,0(%rsp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm3,%xmm0,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm1,%xmm1 + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm0,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm1,%xmm1 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm1,%xmm1 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + vpshufd $80,%xmm1,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm1,%xmm1 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 32(%rbp),%xmm1,%xmm6 + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,16(%rsp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm0,%xmm1,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm2,%xmm2 + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm1,%xmm7 + shrdl $2,%r14d,%r14d + 
addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm2,%xmm2 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm2,%xmm2 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + vpshufd $80,%xmm2,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm2,%xmm2 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 64(%rbp),%xmm2,%xmm6 + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,32(%rsp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm1,%xmm2,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm3,%xmm3 + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm2,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm3,%xmm3 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor 
%xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm3,%xmm3 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + vpshufd $80,%xmm3,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm3,%xmm3 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 96(%rbp),%xmm3,%xmm6 + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne .Lavx_00_47 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + 
movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d 
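+/*
+ * Scalar tail rounds (message expansion is done): each N(%rsp)
+ * operand here already holds W[t]+K256[t], pre-summed by the
+ * vpaddd / vmovdqa pairs in .Lavx_00_47 above.  A round is the
+ * standard SHA-256 compression step
+ *     T1 = h + Sigma1(e) + Ch(e,f,g) + W[t] + K[t]
+ *     T2 = Sigma0(a) + Maj(a,b,c)
+ * with the Sigma rotations composed from shrdl and Ch/Maj folded
+ * into the surrounding andl/xorl chains.
+ */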
+ movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_avx + + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha256_transform_avx,.-sha256_transform_avx +.globl sha256_transform_avx2 +.type sha256_transform_avx2,@function +.align 64 +sha256_transform_avx2: +.cfi_startproc + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx 
+.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + subq $544,%rsp + shlq $4,%rdx + andq $-1024,%rsp + leaq (%rsi,%rdx,4),%rdx + addq $448,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 +.Lprologue_avx2: + + vzeroupper + subq $-64,%rsi + movl 0(%rdi),%eax + movq %rsi,%r12 + movl 4(%rdi),%ebx + cmpq %rdx,%rsi + movl 8(%rdi),%ecx + cmoveq %rsp,%r12 + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + vmovdqa K256+512+32(%rip),%ymm8 + vmovdqa K256+512+64(%rip),%ymm9 + jmp .Loop_avx2 +.align 16 +.Loop_avx2: + vmovdqa K256+512(%rip),%ymm7 + vmovdqu -64+0(%rsi),%xmm0 + vmovdqu -64+16(%rsi),%xmm1 + vmovdqu -64+32(%rsi),%xmm2 + vmovdqu -64+48(%rsi),%xmm3 + + vinserti128 $1,(%r12),%ymm0,%ymm0 + vinserti128 $1,16(%r12),%ymm1,%ymm1 + vpshufb %ymm7,%ymm0,%ymm0 + vinserti128 $1,32(%r12),%ymm2,%ymm2 + vpshufb %ymm7,%ymm1,%ymm1 + vinserti128 $1,48(%r12),%ymm3,%ymm3 + + leaq K256(%rip),%rbp + vpshufb %ymm7,%ymm2,%ymm2 + vpaddd 0(%rbp),%ymm0,%ymm4 + vpshufb %ymm7,%ymm3,%ymm3 + vpaddd 32(%rbp),%ymm1,%ymm5 + vpaddd 64(%rbp),%ymm2,%ymm6 + vpaddd 96(%rbp),%ymm3,%ymm7 + vmovdqa %ymm4,0(%rsp) + xorl %r14d,%r14d + vmovdqa %ymm5,32(%rsp) + + movq 88(%rsp),%rdi +.cfi_def_cfa %rdi,8 + leaq -64(%rsp),%rsp + + + + movq %rdi,-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + movl %ebx,%edi + vmovdqa %ymm6,0(%rsp) + xorl %ecx,%edi + vmovdqa %ymm7,32(%rsp) + movl %r9d,%r12d + subq $-32*4,%rbp + jmp .Lavx2_00_47 + +.align 16 +.Lavx2_00_47: + leaq -64(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08 + + pushq 64-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08 + leaq 8(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + vpalignr $4,%ymm0,%ymm1,%ymm4 + addl 0+128(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + vpalignr $4,%ymm2,%ymm3,%ymm7 + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + vpsrld $7,%ymm4,%ymm6 + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + vpaddd %ymm7,%ymm0,%ymm0 + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + vpshufd $250,%ymm3,%ymm7 + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 4+128(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + vpslld $11,%ymm5,%ymm5 + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + vpaddd %ymm4,%ymm0,%ymm0 + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 8+128(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl 
$11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + vpxor %ymm7,%ymm6,%ymm6 + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + vpaddd %ymm6,%ymm0,%ymm0 + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + vpshufd $80,%ymm0,%ymm7 + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 12+128(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + vpxor %ymm7,%ymm6,%ymm6 + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + vpaddd %ymm6,%ymm0,%ymm0 + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + vpaddd 0(%rbp),%ymm0,%ymm6 + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + vmovdqa %ymm6,0(%rsp) + vpalignr $4,%ymm1,%ymm2,%ymm4 + addl 32+128(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + vpalignr $4,%ymm3,%ymm0,%ymm7 + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + vpsrld $7,%ymm4,%ymm6 + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + vpaddd %ymm7,%ymm1,%ymm1 + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + vpshufd $250,%ymm0,%ymm7 + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 36+128(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + vpslld $11,%ymm5,%ymm5 + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + vpaddd %ymm4,%ymm1,%ymm1 + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 40+128(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + vpxor %ymm7,%ymm6,%ymm6 + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + vpaddd %ymm6,%ymm1,%ymm1 + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + vpshufd $80,%ymm1,%ymm7 + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 
44+128(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + vpxor %ymm7,%ymm6,%ymm6 + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + vpaddd %ymm6,%ymm1,%ymm1 + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + vpaddd 32(%rbp),%ymm1,%ymm6 + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + vmovdqa %ymm6,32(%rsp) + leaq -64(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08 + + pushq 64-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08 + leaq 8(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + vpalignr $4,%ymm2,%ymm3,%ymm4 + addl 0+128(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + vpalignr $4,%ymm0,%ymm1,%ymm7 + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + vpsrld $7,%ymm4,%ymm6 + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + vpaddd %ymm7,%ymm2,%ymm2 + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + vpshufd $250,%ymm1,%ymm7 + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 4+128(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + vpslld $11,%ymm5,%ymm5 + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + vpaddd %ymm4,%ymm2,%ymm2 + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 8+128(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + vpxor %ymm7,%ymm6,%ymm6 + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + vpaddd %ymm6,%ymm2,%ymm2 + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + vpshufd $80,%ymm2,%ymm7 + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 12+128(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + vpxor %ymm7,%ymm6,%ymm6 + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + vpaddd %ymm6,%ymm2,%ymm2 + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi 
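+/*
+ * Two blocks per .Loop_avx2 iteration: the vinserti128 loads above
+ * placed block N in the low 128-bit lanes and block N+1 (or a copy
+ * of the final block, via cmoveq %rsp,%r12) in the high lanes, so
+ * each ymm schedule update advances W[] for both blocks while the
+ * interleaved scalar instructions run the rounds of the first one.
+ */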
+ vpaddd 64(%rbp),%ymm2,%ymm6 + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + vmovdqa %ymm6,0(%rsp) + vpalignr $4,%ymm3,%ymm0,%ymm4 + addl 32+128(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + vpalignr $4,%ymm1,%ymm2,%ymm7 + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + vpsrld $7,%ymm4,%ymm6 + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + vpaddd %ymm7,%ymm3,%ymm3 + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + vpshufd $250,%ymm2,%ymm7 + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 36+128(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + vpslld $11,%ymm5,%ymm5 + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + vpaddd %ymm4,%ymm3,%ymm3 + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 40+128(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + vpxor %ymm7,%ymm6,%ymm6 + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + vpaddd %ymm6,%ymm3,%ymm3 + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + vpshufd $80,%ymm3,%ymm7 + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 44+128(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + vpxor %ymm7,%ymm6,%ymm6 + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + vpaddd %ymm6,%ymm3,%ymm3 + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + vpaddd 96(%rbp),%ymm3,%ymm6 + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + vmovdqa %ymm6,32(%rsp) + leaq 128(%rbp),%rbp + cmpb $0,3(%rbp) + jne .Lavx2_00_47 + addl 0+64(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl 
%ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + addl 4+64(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8+64(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + addl 12+64(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32+64(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + addl 36+64(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40+64(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + addl 44+64(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + 
leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + addl 0(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + addl 4(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + addl 12(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + addl 36(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal 
(%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + addl 44(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + movq 512(%rsp),%rdi + addl %r14d,%eax + + leaq 448(%rsp),%rbp + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + + cmpq 80(%rbp),%rsi + je .Ldone_avx2 + + xorl %r14d,%r14d + movl %ebx,%edi + xorl %ecx,%edi + movl %r9d,%r12d + jmp .Lower_avx2 +.align 16 +.Lower_avx2: + addl 0+16(%rbp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + addl 4+16(%rbp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8+16(%rbp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + addl 12+16(%rbp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl 
%r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32+16(%rbp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + addl 36+16(%rbp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40+16(%rbp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + addl 44+16(%rbp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + leaq -64(%rbp),%rbp + cmpq %rsp,%rbp + jae .Lower_avx2 + + movq 512(%rsp),%rdi + addl %r14d,%eax + + leaq 448(%rsp),%rsp + +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + leaq 128(%rsi),%rsi + addl 24(%rdi),%r10d + movq %rsi,%r12 + addl 28(%rdi),%r11d + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + cmoveq %rsp,%r12 + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + + jbe .Loop_avx2 + leaq (%rsp),%rbp + + +.cfi_escape 0x0f,0x06,0x76,0xd8,0x00,0x06,0x23,0x08 + +.Ldone_avx2: + movq 88(%rbp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx2: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha256_transform_avx2,.-sha256_transform_avx2 +#if defined(__ELF__) + .section .note.GNU-stack,"",%progbits +#endif +#endif diff --git a/module/icp/asm-x86_64/sha2/sha512-x86_64.S b/module/icp/asm-x86_64/sha2/sha512-x86_64.S new file mode 100644 index 000000000000..f1d908aaa450 --- 
/dev/null +++ b/module/icp/asm-x86_64/sha2/sha512-x86_64.S @@ -0,0 +1,2650 @@ +/* + * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#if defined(__x86_64) + +.text + +.align 64 +.type K512,@object +K512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 
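+/*
+ * Note that every K512 row in this table is stored twice: the
+ * 128-bit AVX code reads one {K[2t],K[2t+1]} copy per 32-byte
+ * stride, and the doubled rows let 256-bit code add identical
+ * round constants to both halves of a %ymm register.
+ */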
+.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +.quad 0x0001020304050607,0x08090a0b0c0d0e0f +.quad 0x0001020304050607,0x08090a0b0c0d0e0f +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 + +.globl sha512_transform_avx +.type sha512_transform_avx,@function +.align 64 +sha512_transform_avx: +.cfi_startproc + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $160,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 +.Lprologue_avx: + + vzeroupper + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa K512+1280(%rip),%xmm11 + vmovdqu 0(%rsi),%xmm0 + leaq K512+128(%rip),%rbp + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vpshufb %xmm11,%xmm0,%xmm0 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm11,%xmm1,%xmm1 + vmovdqu 64(%rsi),%xmm4 + vpshufb %xmm11,%xmm2,%xmm2 + vmovdqu 80(%rsi),%xmm5 + vpshufb %xmm11,%xmm3,%xmm3 + vmovdqu 96(%rsi),%xmm6 + vpshufb %xmm11,%xmm4,%xmm4 + vmovdqu 112(%rsi),%xmm7 + vpshufb %xmm11,%xmm5,%xmm5 + vpaddq -128(%rbp),%xmm0,%xmm8 + vpshufb %xmm11,%xmm6,%xmm6 + vpaddq -96(%rbp),%xmm1,%xmm9 + vpshufb %xmm11,%xmm7,%xmm7 + vpaddq -64(%rbp),%xmm2,%xmm10 + vpaddq -32(%rbp),%xmm3,%xmm11 + vmovdqa %xmm8,0(%rsp) + vpaddq 0(%rbp),%xmm4,%xmm8 + vmovdqa %xmm9,16(%rsp) + vpaddq 32(%rbp),%xmm5,%xmm9 + vmovdqa %xmm10,32(%rsp) + vpaddq 64(%rbp),%xmm6,%xmm10 + vmovdqa %xmm11,48(%rsp) + vpaddq 96(%rbp),%xmm7,%xmm11 + vmovdqa %xmm8,64(%rsp) + movq %rax,%r14 + vmovdqa %xmm9,80(%rsp) + movq %rbx,%rdi + vmovdqa %xmm10,96(%rsp) + xorq %rcx,%rdi + vmovdqa %xmm11,112(%rsp) + movq %r8,%r13 + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + addq $256,%rbp + vpalignr $8,%xmm0,%xmm1,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm4,%xmm5,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm0,%xmm0 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 0(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm7,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor 
%xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm7,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm0,%xmm0 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm7,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 8(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm0,%xmm0 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq -128(%rbp),%xmm0,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,0(%rsp) + vpalignr $8,%xmm1,%xmm2,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm5,%xmm6,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm1,%xmm1 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 16(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm0,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm0,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm1,%xmm1 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm0,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 24(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm1,%xmm1 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq -96(%rbp),%xmm1,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,16(%rsp) + vpalignr $8,%xmm2,%xmm3,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm6,%xmm7,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm2,%xmm2 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 32(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm1,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm1,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm2,%xmm2 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm1,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq 
$42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 40(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm2,%xmm2 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq -64(%rbp),%xmm2,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,32(%rsp) + vpalignr $8,%xmm3,%xmm4,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm7,%xmm0,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm3,%xmm3 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 48(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm2,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm2,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm3,%xmm3 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm2,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 56(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm3,%xmm3 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq -32(%rbp),%xmm3,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,48(%rsp) + vpalignr $8,%xmm4,%xmm5,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm0,%xmm1,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm4,%xmm4 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 64(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm3,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm3,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm4,%xmm4 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm3,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 72(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq 
%xmm11,%xmm4,%xmm4 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq 0(%rbp),%xmm4,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,64(%rsp) + vpalignr $8,%xmm5,%xmm6,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm1,%xmm2,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm5,%xmm5 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 80(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm4,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm4,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm5,%xmm5 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm4,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 88(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm5,%xmm5 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq 32(%rbp),%xmm5,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,80(%rsp) + vpalignr $8,%xmm6,%xmm7,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm2,%xmm3,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm6,%xmm6 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 96(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm5,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm5,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm6,%xmm6 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm5,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 104(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm6,%xmm6 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq 64(%rbp),%xmm6,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,96(%rsp) + vpalignr $8,%xmm7,%xmm0,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm3,%xmm4,%xmm11 + movq 
%r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm7,%xmm7 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 112(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm6,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm6,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm7,%xmm7 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm6,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 120(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm7,%xmm7 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq 96(%rbp),%xmm7,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,112(%rsp) + cmpb $0,135(%rbp) + jne .Lavx_00_47 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq 
%r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq 
%r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + movq 128+0(%rsp),%rdi + movq %r14,%rax + + addq 0(%rdi),%rax + leaq 128(%rsi),%rsi + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb .Lloop_avx + + movq 152(%rsp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha512_transform_avx,.-sha512_transform_avx +.globl sha512_transform_avx2 +.type sha512_transform_avx2,@function +.align 64 +sha512_transform_avx2: +.cfi_startproc + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + subq $1312,%rsp + shlq $4,%rdx + andq $-2048,%rsp + leaq (%rsi,%rdx,8),%rdx + addq $1152,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 +.Lprologue_avx2: + + vzeroupper + subq $-128,%rsi + movq 0(%rdi),%rax + movq %rsi,%r12 + movq 8(%rdi),%rbx + 
cmpq %rdx,%rsi + movq 16(%rdi),%rcx + cmoveq %rsp,%r12 + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Loop_avx2 +.align 16 +.Loop_avx2: + vmovdqu -128(%rsi),%xmm0 + vmovdqu -128+16(%rsi),%xmm1 + vmovdqu -128+32(%rsi),%xmm2 + leaq K512+128(%rip),%rbp + vmovdqu -128+48(%rsi),%xmm3 + vmovdqu -128+64(%rsi),%xmm4 + vmovdqu -128+80(%rsi),%xmm5 + vmovdqu -128+96(%rsi),%xmm6 + vmovdqu -128+112(%rsi),%xmm7 + + vmovdqa 1152(%rbp),%ymm10 + vinserti128 $1,(%r12),%ymm0,%ymm0 + vinserti128 $1,16(%r12),%ymm1,%ymm1 + vpshufb %ymm10,%ymm0,%ymm0 + vinserti128 $1,32(%r12),%ymm2,%ymm2 + vpshufb %ymm10,%ymm1,%ymm1 + vinserti128 $1,48(%r12),%ymm3,%ymm3 + vpshufb %ymm10,%ymm2,%ymm2 + vinserti128 $1,64(%r12),%ymm4,%ymm4 + vpshufb %ymm10,%ymm3,%ymm3 + vinserti128 $1,80(%r12),%ymm5,%ymm5 + vpshufb %ymm10,%ymm4,%ymm4 + vinserti128 $1,96(%r12),%ymm6,%ymm6 + vpshufb %ymm10,%ymm5,%ymm5 + vinserti128 $1,112(%r12),%ymm7,%ymm7 + + vpaddq -128(%rbp),%ymm0,%ymm8 + vpshufb %ymm10,%ymm6,%ymm6 + vpaddq -96(%rbp),%ymm1,%ymm9 + vpshufb %ymm10,%ymm7,%ymm7 + vpaddq -64(%rbp),%ymm2,%ymm10 + vpaddq -32(%rbp),%ymm3,%ymm11 + vmovdqa %ymm8,0(%rsp) + vpaddq 0(%rbp),%ymm4,%ymm8 + vmovdqa %ymm9,32(%rsp) + vpaddq 32(%rbp),%ymm5,%ymm9 + vmovdqa %ymm10,64(%rsp) + vpaddq 64(%rbp),%ymm6,%ymm10 + vmovdqa %ymm11,96(%rsp) + + movq 152(%rsp),%rdi +.cfi_def_cfa %rdi,8 + leaq -128(%rsp),%rsp + + + + movq %rdi,-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + vpaddq 96(%rbp),%ymm7,%ymm11 + vmovdqa %ymm8,0(%rsp) + xorq %r14,%r14 + vmovdqa %ymm9,32(%rsp) + movq %rbx,%rdi + vmovdqa %ymm10,64(%rsp) + xorq %rcx,%rdi + vmovdqa %ymm11,96(%rsp) + movq %r9,%r12 + addq $32*8,%rbp + jmp .Lavx2_00_47 + +.align 16 +.Lavx2_00_47: + leaq -128(%rsp),%rsp +.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08 + + pushq 128-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08 + leaq 8(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + vpalignr $8,%ymm0,%ymm1,%ymm8 + addq 0+256(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + vpalignr $8,%ymm4,%ymm5,%ymm11 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + vpsrlq $1,%ymm8,%ymm10 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + vpaddq %ymm11,%ymm0,%ymm0 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + vpsrlq $6,%ymm7,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + vpsllq $3,%ymm7,%ymm10 + vpaddq %ymm8,%ymm0,%ymm0 + addq 8+256(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + vpsrlq $19,%ymm7,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + vpaddq %ymm11,%ymm0,%ymm0 + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + vpaddq -128(%rbp),%ymm0,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + 
xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + vmovdqa %ymm10,0(%rsp) + vpalignr $8,%ymm1,%ymm2,%ymm8 + addq 32+256(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + vpalignr $8,%ymm5,%ymm6,%ymm11 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + vpsrlq $1,%ymm8,%ymm10 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + vpaddq %ymm11,%ymm1,%ymm1 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + vpsrlq $6,%ymm0,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + vpsllq $3,%ymm0,%ymm10 + vpaddq %ymm8,%ymm1,%ymm1 + addq 40+256(%rsp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + vpsrlq $19,%ymm0,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + vpaddq %ymm11,%ymm1,%ymm1 + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + vpaddq -96(%rbp),%ymm1,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + vmovdqa %ymm10,32(%rsp) + vpalignr $8,%ymm2,%ymm3,%ymm8 + addq 64+256(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + vpalignr $8,%ymm6,%ymm7,%ymm11 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + vpsrlq $1,%ymm8,%ymm10 + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + vpaddq %ymm11,%ymm2,%ymm2 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + vpsrlq $6,%ymm1,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + vpsllq $3,%ymm1,%ymm10 + vpaddq %ymm8,%ymm2,%ymm2 + addq 72+256(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + vpsrlq $19,%ymm1,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + vpaddq %ymm11,%ymm2,%ymm2 + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + vpaddq -64(%rbp),%ymm2,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + vmovdqa %ymm10,64(%rsp) + vpalignr $8,%ymm3,%ymm4,%ymm8 + addq 96+256(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + vpalignr 
$8,%ymm7,%ymm0,%ymm11 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + vpsrlq $1,%ymm8,%ymm10 + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + vpaddq %ymm11,%ymm3,%ymm3 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + vpsrlq $6,%ymm2,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + vpsllq $3,%ymm2,%ymm10 + vpaddq %ymm8,%ymm3,%ymm3 + addq 104+256(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + vpsrlq $19,%ymm2,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + vpaddq %ymm11,%ymm3,%ymm3 + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + vpaddq -32(%rbp),%ymm3,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + vmovdqa %ymm10,96(%rsp) + leaq -128(%rsp),%rsp +.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08 + + pushq 128-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08 + leaq 8(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + vpalignr $8,%ymm4,%ymm5,%ymm8 + addq 0+256(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + vpalignr $8,%ymm0,%ymm1,%ymm11 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + vpsrlq $1,%ymm8,%ymm10 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + vpaddq %ymm11,%ymm4,%ymm4 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + vpsrlq $6,%ymm3,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + vpsllq $3,%ymm3,%ymm10 + vpaddq %ymm8,%ymm4,%ymm4 + addq 8+256(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + vpsrlq $19,%ymm3,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + vpaddq %ymm11,%ymm4,%ymm4 + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + vpaddq 0(%rbp),%ymm4,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + vmovdqa %ymm10,0(%rsp) + vpalignr $8,%ymm5,%ymm6,%ymm8 + addq 32+256(%rsp),%r9 + andq %rcx,%r12 + rorxq 
$41,%rcx,%r13 + vpalignr $8,%ymm1,%ymm2,%ymm11 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + vpsrlq $1,%ymm8,%ymm10 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + vpaddq %ymm11,%ymm5,%ymm5 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + vpsrlq $6,%ymm4,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + vpsllq $3,%ymm4,%ymm10 + vpaddq %ymm8,%ymm5,%ymm5 + addq 40+256(%rsp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + vpsrlq $19,%ymm4,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + vpaddq %ymm11,%ymm5,%ymm5 + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + vpaddq 32(%rbp),%ymm5,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + vmovdqa %ymm10,32(%rsp) + vpalignr $8,%ymm6,%ymm7,%ymm8 + addq 64+256(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + vpalignr $8,%ymm2,%ymm3,%ymm11 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + vpsrlq $1,%ymm8,%ymm10 + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + vpaddq %ymm11,%ymm6,%ymm6 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + vpsrlq $6,%ymm5,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + vpsllq $3,%ymm5,%ymm10 + vpaddq %ymm8,%ymm6,%ymm6 + addq 72+256(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + vpsrlq $19,%ymm5,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + vpaddq %ymm11,%ymm6,%ymm6 + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + vpaddq 64(%rbp),%ymm6,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + vmovdqa %ymm10,64(%rsp) + vpalignr $8,%ymm7,%ymm0,%ymm8 + addq 96+256(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + vpalignr $8,%ymm3,%ymm4,%ymm11 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + vpsrlq $1,%ymm8,%ymm10 + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + 
vpaddq %ymm11,%ymm7,%ymm7 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + vpsrlq $6,%ymm6,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + vpsllq $3,%ymm6,%ymm10 + vpaddq %ymm8,%ymm7,%ymm7 + addq 104+256(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + vpsrlq $19,%ymm6,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + vpaddq %ymm11,%ymm7,%ymm7 + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + vpaddq 96(%rbp),%ymm7,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + vmovdqa %ymm10,96(%rsp) + leaq 256(%rbp),%rbp + cmpb $0,-121(%rbp) + jne .Lavx2_00_47 + addq 0+128(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + addq 8+128(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + addq 32+128(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + addq 40+128(%rsp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + addq 64+128(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + 
leaq (%rdx,%r12,1),%rdx + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + addq 72+128(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + addq 96+128(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + addq 104+128(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + addq 0(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + addq 8(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + addq 32(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + addq 40(%rsp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + 
rorxq $14,%rbx,%r14 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + addq 64(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + addq 72(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + addq 96(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + addq 104(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + movq 1280(%rsp),%rdi + addq %r14,%rax + + leaq 1152(%rsp),%rbp + + addq 0(%rdi),%rax + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + + cmpq 144(%rbp),%rsi + je .Ldone_avx2 + + xorq %r14,%r14 + movq %rbx,%rdi + xorq %rcx,%rdi + movq %r9,%r12 + jmp .Lower_avx2 +.align 16 +.Lower_avx2: + addq 0+16(%rbp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + addq 8+16(%rbp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + 
leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + addq 32+16(%rbp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + addq 40+16(%rbp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + addq 64+16(%rbp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + addq 72+16(%rbp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + addq 96+16(%rbp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + addq 104+16(%rbp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + leaq -128(%rbp),%rbp + cmpq %rsp,%rbp + jae .Lower_avx2 + + movq 1280(%rsp),%rdi + addq %r14,%rax + + leaq 1152(%rsp),%rsp + +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 + + addq 0(%rdi),%rax + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + 
addq 32(%rdi),%r8
+	addq	40(%rdi),%r9
+	leaq	256(%rsi),%rsi
+	addq	48(%rdi),%r10
+	movq	%rsi,%r12
+	addq	56(%rdi),%r11
+	cmpq	128+16(%rsp),%rsi
+
+	movq	%rax,0(%rdi)
+	cmoveq	%rsp,%r12
+	movq	%rbx,8(%rdi)
+	movq	%rcx,16(%rdi)
+	movq	%rdx,24(%rdi)
+	movq	%r8,32(%rdi)
+	movq	%r9,40(%rdi)
+	movq	%r10,48(%rdi)
+	movq	%r11,56(%rdi)
+
+	jbe	.Loop_avx2
+	leaq	(%rsp),%rbp
+
+
+.cfi_escape	0x0f,0x06,0x76,0x98,0x01,0x06,0x23,0x08
+
+.Ldone_avx2:
+	movq	152(%rbp),%rsi
+.cfi_def_cfa	%rsi,8
+	vzeroupper
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lepilogue_avx2:
+	.byte	0xf3,0xc3
+.cfi_endproc
+.size	sha512_transform_avx2,.-sha512_transform_avx2
+#if defined(__ELF__)
+	.section .note.GNU-stack,"",%progbits
+#endif
+#endif
diff --git a/module/icp/illumos-crypto.c b/module/icp/illumos-crypto.c
index 13f05c06ed5c..caa0e790469b 100644
--- a/module/icp/illumos-crypto.c
+++ b/module/icp/illumos-crypto.c
@@ -108,7 +108,6 @@ void
 icp_fini(void)
 {
 	skein_mod_fini();
-	sha2_mod_fini();
 	aes_mod_fini();
 	kcf_sched_destroy();
 	kcf_prov_tab_destroy();
@@ -133,7 +132,6 @@ icp_init(void)
 
 	/* initialize algorithms */
 	aes_mod_init();
-	sha2_mod_init();
 	skein_mod_init();
 
 	return (0);
diff --git a/module/icp/include/generic_impl.c b/module/icp/include/generic_impl.c
new file mode 100644
index 000000000000..38f0187815c9
--- /dev/null
+++ b/module/icp/include/generic_impl.c
@@ -0,0 +1,240 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2003, 2010 Oracle and/or its affiliates.
+ * Copyright (c) 2022 Tino Reichardt
+ */
+
+/*
+ * This file gets included by c files for implementing the full set
+ * of zfs_impl.h defines.
+ *
+ * It's meant to make maintaining multiple implementations of
+ * algorithms easier.
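+ *
+ * As an illustrative sketch (the sha256_* names here are examples,
+ * not required by this file), a consumer defines the template macros
+ * and then includes this file:
+ *
+ *	#define	IMPL_NAME	"sha256"
+ *	#define	IMPL_OPS_T	sha256_ops_t
+ *	#define	IMPL_ARRAY	sha256_impls
+ *	#define	IMPL_GET_OPS	sha256_get_ops
+ *	#define	ZFS_IMPL_OPS	zfs_sha256_ops
+ *	#include <generic_impl.c>
+ *
+ * This instantiates a zfs_impl_t named zfs_sha256_ops plus a
+ * sha256_get_ops() helper returning the currently selected
+ * sha256_ops_t, so callers can drive any algorithm generically, e.g.:
+ *
+ *	const zfs_impl_t *sha256 = zfs_impl_get_ops("sha256");
+ *	sha256->setname("generic");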
+ */
+
+#include
+#include
+#include
+
+/* Two default implementations */
+#define	IMPL_FASTEST	(UINT32_MAX)
+#define	IMPL_CYCLE	(UINT32_MAX - 1)
+
+#define	IMPL_READ(i)	(*(volatile uint32_t *) &(i))
+
+/* Indicate that implementation has been initialized */
+static boolean_t generic_initialized = B_FALSE;
+
+/* Implementation that contains the fastest method */
+static IMPL_OPS_T generic_fastest_impl = {
+	.name = "fastest"
+};
+
+/* Hold all supported implementations */
+static const IMPL_OPS_T *generic_supp_impls[ARRAY_SIZE(IMPL_ARRAY)];
+static uint32_t generic_supp_impls_cnt = 0;
+
+/* Currently selected implementation */
+static uint32_t generic_impl_chosen = IMPL_FASTEST;
+
+static struct generic_impl_selector {
+	const char *name;
+	uint32_t sel;
+} generic_impl_selectors[] = {
+	{ "cycle",	IMPL_CYCLE },
+	{ "fastest",	IMPL_FASTEST }
+};
+
+/* check the supported implementations */
+static void generic_impl_init(void)
+{
+	int i, c;
+
+	/* init only once */
+	if (likely(generic_initialized))
+		return;
+
+	/* Move supported implementations into generic_supp_impls */
+	for (i = 0, c = 0; i < ARRAY_SIZE(IMPL_ARRAY); i++) {
+		const IMPL_OPS_T *impl = IMPL_ARRAY[i];
+
+		if (impl->is_supported && impl->is_supported())
+			generic_supp_impls[c++] = impl;
+	}
+	generic_supp_impls_cnt = c;
+
+	/* first init generic impl, may be changed via set_fastest() */
+	memcpy(&generic_fastest_impl, generic_supp_impls[0],
+	    sizeof (generic_fastest_impl));
+	generic_initialized = B_TRUE;
+}
+
+/* get number of supported implementations */
+static uint32_t
+generic_impl_getcnt(void)
+{
+	generic_impl_init();
+	return (generic_supp_impls_cnt);
+}
+
+/* get id of selected implementation */
+static uint32_t
+generic_impl_getid(void)
+{
+	generic_impl_init();
+	return (IMPL_READ(generic_impl_chosen));
+}
+
+/* get name of selected implementation */
+static const char *
+generic_impl_getname(void)
+{
+	uint32_t impl = IMPL_READ(generic_impl_chosen);
+
+	generic_impl_init();
+	switch (impl) {
+	case IMPL_FASTEST:
+		return ("fastest");
+	case IMPL_CYCLE:
+		return ("cycle");
+	default:
+		return (generic_supp_impls[impl]->name);
+	}
+}
+
+/* set implementation by id */
+static void
+generic_impl_setid(uint32_t id)
+{
+	generic_impl_init();
+	switch (id) {
+	case IMPL_FASTEST:
+		atomic_swap_32(&generic_impl_chosen, IMPL_FASTEST);
+		break;
+	case IMPL_CYCLE:
+		atomic_swap_32(&generic_impl_chosen, IMPL_CYCLE);
+		break;
+	default:
+		ASSERT3U(id, >=, 0);
+		ASSERT3U(id, <, generic_supp_impls_cnt);
+		atomic_swap_32(&generic_impl_chosen, id);
+		break;
+	}
+}
+
+/* set implementation by name */
+static int
+generic_impl_setname(const char *val)
+{
+	uint32_t impl = IMPL_READ(generic_impl_chosen);
+	size_t val_len;
+	int i, err = -EINVAL;
+
+	generic_impl_init();
+	val_len = strlen(val);
+	while ((val_len > 0) && !!isspace(val[val_len-1])) /* trim '\n' */
+		val_len--;
+
+	/* check mandatory implementations */
+	for (i = 0; i < ARRAY_SIZE(generic_impl_selectors); i++) {
+		const char *name = generic_impl_selectors[i].name;
+
+		if (val_len == strlen(name) &&
+		    strncmp(val, name, val_len) == 0) {
+			impl = generic_impl_selectors[i].sel;
+			err = 0;
+			break;
+		}
+	}
+
+	if (err != 0 && generic_initialized) {
+		/* check all supported implementations */
+		for (i = 0; i < generic_supp_impls_cnt; i++) {
+			const char *name = generic_supp_impls[i]->name;
+
+			if (val_len == strlen(name) &&
+			    strncmp(val, name, val_len) == 0) {
+				impl = i;
+				err = 0;
+				break;
+			}
+		}
+	}
+
+	if (err == 0)
+		atomic_swap_32(&generic_impl_chosen, impl);
+
+	return (err);
+}
+
+/* setup id as fastest implementation */
+static void
+generic_impl_set_fastest(uint32_t id)
+{
+	/* setup fastest impl */
+	memcpy(&generic_fastest_impl, generic_supp_impls[id],
+	    sizeof (generic_fastest_impl));
+}
+
+/* return impl iterating functions */
+const zfs_impl_t ZFS_IMPL_OPS = {
+	.name = IMPL_NAME,
+	.getcnt = generic_impl_getcnt,
+	.getid = generic_impl_getid,
+	.getname = generic_impl_getname,
+	.set_fastest = generic_impl_set_fastest,
+	.setid = generic_impl_setid,
+	.setname = generic_impl_setname
+};
+
+/* get impl ops_t of selected implementation */
+const IMPL_OPS_T *
+IMPL_GET_OPS(void)
+{
+	const IMPL_OPS_T *ops = NULL;
+	uint32_t impl = IMPL_READ(generic_impl_chosen);
+
+	generic_impl_init();
+	switch (impl) {
+	case IMPL_FASTEST:
+		ASSERT(generic_initialized);
+		ops = &generic_fastest_impl;
+		break;
+	case IMPL_CYCLE:
+		/* Cycle through supported implementations */
+		ASSERT(generic_initialized);
+		ASSERT3U(generic_supp_impls_cnt, >, 0);
+		static uint32_t cycle_count = 0;
+		uint32_t idx = (++cycle_count) % generic_supp_impls_cnt;
+		ops = generic_supp_impls[idx];
+		break;
+	default:
+		ASSERT3U(generic_supp_impls_cnt, >, 0);
+		ASSERT3U(impl, <, generic_supp_impls_cnt);
+		ops = generic_supp_impls[impl];
+		break;
+	}
+
+	ASSERT3P(ops, !=, NULL);
+	return (ops);
+}
diff --git a/module/icp/io/sha2_mod.c b/module/icp/io/sha2_mod.c
deleted file mode 100644
index fadb58b81881..000000000000
--- a/module/icp/io/sha2_mod.c
+++ /dev/null
@@ -1,1290 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or https://opensource.org/licenses/CDDL-1.0.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#include
-#include
-#include
-#include
-#define	_SHA2_IMPL
-#include
-#include
-
-/*
- * Macros to access the SHA2 or SHA2-HMAC contexts from a context passed
- * by KCF to one of the entry points.
- */
-
-#define	PROV_SHA2_CTX(ctx)	((sha2_ctx_t *)(ctx)->cc_provider_private)
-#define	PROV_SHA2_HMAC_CTX(ctx)	((sha2_hmac_ctx_t *)(ctx)->cc_provider_private)
-
-/* to extract the digest length passed as mechanism parameter */
-#define	PROV_SHA2_GET_DIGEST_LEN(m, len) {			\
-	if (IS_P2ALIGNED((m)->cm_param, sizeof (ulong_t)))	\
-		(len) = (uint32_t)*((ulong_t *)(m)->cm_param);	\
-	else {							\
-		ulong_t tmp_ulong;				\
-		memcpy(&tmp_ulong, (m)->cm_param, sizeof (ulong_t)); \
-		(len) = (uint32_t)tmp_ulong;			\
-	}							\
-}
-
-#define	PROV_SHA2_DIGEST_KEY(mech, ctx, key, len, digest) {	\
-	SHA2Init(mech, ctx);					\
-	SHA2Update(ctx, key, len);				\
-	SHA2Final(digest, ctx);					\
-}
-
-/*
- * Mechanism info structure passed to KCF during registration.
- */ -static const crypto_mech_info_t sha2_mech_info_tab[] = { - /* SHA256 */ - {SUN_CKM_SHA256, SHA256_MECH_INFO_TYPE, - CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC}, - /* SHA256-HMAC */ - {SUN_CKM_SHA256_HMAC, SHA256_HMAC_MECH_INFO_TYPE, - CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, - /* SHA256-HMAC GENERAL */ - {SUN_CKM_SHA256_HMAC_GENERAL, SHA256_HMAC_GEN_MECH_INFO_TYPE, - CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, - /* SHA384 */ - {SUN_CKM_SHA384, SHA384_MECH_INFO_TYPE, - CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC}, - /* SHA384-HMAC */ - {SUN_CKM_SHA384_HMAC, SHA384_HMAC_MECH_INFO_TYPE, - CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, - /* SHA384-HMAC GENERAL */ - {SUN_CKM_SHA384_HMAC_GENERAL, SHA384_HMAC_GEN_MECH_INFO_TYPE, - CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, - /* SHA512 */ - {SUN_CKM_SHA512, SHA512_MECH_INFO_TYPE, - CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC}, - /* SHA512-HMAC */ - {SUN_CKM_SHA512_HMAC, SHA512_HMAC_MECH_INFO_TYPE, - CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, - /* SHA512-HMAC GENERAL */ - {SUN_CKM_SHA512_HMAC_GENERAL, SHA512_HMAC_GEN_MECH_INFO_TYPE, - CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC}, -}; - -static int sha2_digest_init(crypto_ctx_t *, crypto_mechanism_t *); -static int sha2_digest(crypto_ctx_t *, crypto_data_t *, crypto_data_t *); -static int sha2_digest_update(crypto_ctx_t *, crypto_data_t *); -static int sha2_digest_final(crypto_ctx_t *, crypto_data_t *); -static int sha2_digest_atomic(crypto_mechanism_t *, crypto_data_t *, - crypto_data_t *); - -static const crypto_digest_ops_t sha2_digest_ops = { - .digest_init = sha2_digest_init, - .digest = sha2_digest, - .digest_update = sha2_digest_update, - .digest_final = sha2_digest_final, - .digest_atomic = sha2_digest_atomic -}; - -static int sha2_mac_init(crypto_ctx_t *, crypto_mechanism_t *, crypto_key_t *, - crypto_spi_ctx_template_t); -static int sha2_mac_update(crypto_ctx_t *, crypto_data_t *); -static int sha2_mac_final(crypto_ctx_t *, crypto_data_t *); -static int sha2_mac_atomic(crypto_mechanism_t *, crypto_key_t *, - crypto_data_t *, crypto_data_t *, crypto_spi_ctx_template_t); -static int sha2_mac_verify_atomic(crypto_mechanism_t *, crypto_key_t *, - crypto_data_t *, crypto_data_t *, crypto_spi_ctx_template_t); - -static const crypto_mac_ops_t sha2_mac_ops = { - .mac_init = sha2_mac_init, - .mac = NULL, - .mac_update = sha2_mac_update, - .mac_final = sha2_mac_final, - .mac_atomic = sha2_mac_atomic, - .mac_verify_atomic = sha2_mac_verify_atomic -}; - -static int sha2_create_ctx_template(crypto_mechanism_t *, crypto_key_t *, - crypto_spi_ctx_template_t *, size_t *); -static int sha2_free_context(crypto_ctx_t *); - -static const crypto_ctx_ops_t sha2_ctx_ops = { - .create_ctx_template = sha2_create_ctx_template, - .free_context = sha2_free_context -}; - -static const crypto_ops_t sha2_crypto_ops = { - &sha2_digest_ops, - NULL, - &sha2_mac_ops, - &sha2_ctx_ops, -}; - -static const crypto_provider_info_t sha2_prov_info = { - "SHA2 Software Provider", - &sha2_crypto_ops, - sizeof (sha2_mech_info_tab) / sizeof (crypto_mech_info_t), - sha2_mech_info_tab -}; - -static crypto_kcf_provider_handle_t sha2_prov_handle = 0; - -int -sha2_mod_init(void) -{ - int ret; - - /* - * Register with KCF. If the registration fails, log an - * error but do not uninstall the module, since the functionality - * provided by misc/sha2 should still be available. 
- */ - if ((ret = crypto_register_provider(&sha2_prov_info, - &sha2_prov_handle)) != CRYPTO_SUCCESS) - cmn_err(CE_WARN, "sha2 _init: " - "crypto_register_provider() failed (0x%x)", ret); - - return (0); -} - -int -sha2_mod_fini(void) -{ - int ret = 0; - - if (sha2_prov_handle != 0) { - if ((ret = crypto_unregister_provider(sha2_prov_handle)) != - CRYPTO_SUCCESS) { - cmn_err(CE_WARN, - "sha2 _fini: crypto_unregister_provider() " - "failed (0x%x)", ret); - return (EBUSY); - } - sha2_prov_handle = 0; - } - - return (ret); -} - -/* - * KCF software provider digest entry points. - */ - -static int -sha2_digest_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism) -{ - - /* - * Allocate and initialize SHA2 context. - */ - ctx->cc_provider_private = kmem_alloc(sizeof (sha2_ctx_t), KM_SLEEP); - if (ctx->cc_provider_private == NULL) - return (CRYPTO_HOST_MEMORY); - - PROV_SHA2_CTX(ctx)->sc_mech_type = mechanism->cm_type; - SHA2Init(mechanism->cm_type, &PROV_SHA2_CTX(ctx)->sc_sha2_ctx); - - return (CRYPTO_SUCCESS); -} - -/* - * Helper SHA2 digest update function for uio data. - */ -static int -sha2_digest_update_uio(SHA2_CTX *sha2_ctx, crypto_data_t *data) -{ - off_t offset = data->cd_offset; - size_t length = data->cd_length; - uint_t vec_idx = 0; - size_t cur_len; - - /* we support only kernel buffer */ - if (zfs_uio_segflg(data->cd_uio) != UIO_SYSSPACE) - return (CRYPTO_ARGUMENTS_BAD); - - /* - * Jump to the first iovec containing data to be - * digested. - */ - offset = zfs_uio_index_at_offset(data->cd_uio, offset, &vec_idx); - if (vec_idx == zfs_uio_iovcnt(data->cd_uio)) { - /* - * The caller specified an offset that is larger than the - * total size of the buffers it provided. - */ - return (CRYPTO_DATA_LEN_RANGE); - } - - /* - * Now do the digesting on the iovecs. - */ - while (vec_idx < zfs_uio_iovcnt(data->cd_uio) && length > 0) { - cur_len = MIN(zfs_uio_iovlen(data->cd_uio, vec_idx) - - offset, length); - - SHA2Update(sha2_ctx, (uint8_t *)zfs_uio_iovbase(data->cd_uio, - vec_idx) + offset, cur_len); - length -= cur_len; - vec_idx++; - offset = 0; - } - - if (vec_idx == zfs_uio_iovcnt(data->cd_uio) && length > 0) { - /* - * The end of the specified iovec's was reached but - * the length requested could not be processed, i.e. - * The caller requested to digest more data than it provided. - */ - return (CRYPTO_DATA_LEN_RANGE); - } - - return (CRYPTO_SUCCESS); -} - -/* - * Helper SHA2 digest final function for uio data. - * digest_len is the length of the desired digest. If digest_len - * is smaller than the default SHA2 digest length, the caller - * must pass a scratch buffer, digest_scratch, which must - * be at least the algorithm's digest length bytes. - */ -static int -sha2_digest_final_uio(SHA2_CTX *sha2_ctx, crypto_data_t *digest, - ulong_t digest_len, uchar_t *digest_scratch) -{ - off_t offset = digest->cd_offset; - uint_t vec_idx = 0; - - /* we support only kernel buffer */ - if (zfs_uio_segflg(digest->cd_uio) != UIO_SYSSPACE) - return (CRYPTO_ARGUMENTS_BAD); - - /* - * Jump to the first iovec containing ptr to the digest to - * be returned. - */ - offset = zfs_uio_index_at_offset(digest->cd_uio, offset, &vec_idx); - if (vec_idx == zfs_uio_iovcnt(digest->cd_uio)) { - /* - * The caller specified an offset that is - * larger than the total size of the buffers - * it provided. - */ - return (CRYPTO_DATA_LEN_RANGE); - } - - if (offset + digest_len <= - zfs_uio_iovlen(digest->cd_uio, vec_idx)) { - /* - * The computed SHA2 digest will fit in the current - * iovec. 
- */ - if (((sha2_ctx->algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) && - (digest_len != SHA256_DIGEST_LENGTH)) || - ((sha2_ctx->algotype > SHA256_HMAC_GEN_MECH_INFO_TYPE) && - (digest_len != SHA512_DIGEST_LENGTH))) { - /* - * The caller requested a short digest. Digest - * into a scratch buffer and return to - * the user only what was requested. - */ - SHA2Final(digest_scratch, sha2_ctx); - - memcpy((uchar_t *) - zfs_uio_iovbase(digest->cd_uio, vec_idx) + offset, - digest_scratch, digest_len); - } else { - SHA2Final((uchar_t *)zfs_uio_iovbase(digest-> - cd_uio, vec_idx) + offset, - sha2_ctx); - - } - } else { - /* - * The computed digest will be crossing one or more iovec's. - * This is bad performance-wise but we need to support it. - * Allocate a small scratch buffer on the stack and - * copy it piece meal to the specified digest iovec's. - */ - uchar_t digest_tmp[SHA512_DIGEST_LENGTH]; - off_t scratch_offset = 0; - size_t length = digest_len; - size_t cur_len; - - SHA2Final(digest_tmp, sha2_ctx); - - while (vec_idx < zfs_uio_iovcnt(digest->cd_uio) && length > 0) { - cur_len = - MIN(zfs_uio_iovlen(digest->cd_uio, vec_idx) - - offset, length); - memcpy( - zfs_uio_iovbase(digest->cd_uio, vec_idx) + offset, - digest_tmp + scratch_offset, - cur_len); - - length -= cur_len; - vec_idx++; - scratch_offset += cur_len; - offset = 0; - } - - if (vec_idx == zfs_uio_iovcnt(digest->cd_uio) && length > 0) { - /* - * The end of the specified iovec's was reached but - * the length requested could not be processed, i.e. - * The caller requested to digest more data than it - * provided. - */ - return (CRYPTO_DATA_LEN_RANGE); - } - } - - return (CRYPTO_SUCCESS); -} - -static int -sha2_digest(crypto_ctx_t *ctx, crypto_data_t *data, crypto_data_t *digest) -{ - int ret = CRYPTO_SUCCESS; - uint_t sha_digest_len; - - ASSERT(ctx->cc_provider_private != NULL); - - switch (PROV_SHA2_CTX(ctx)->sc_mech_type) { - case SHA256_MECH_INFO_TYPE: - sha_digest_len = SHA256_DIGEST_LENGTH; - break; - case SHA384_MECH_INFO_TYPE: - sha_digest_len = SHA384_DIGEST_LENGTH; - break; - case SHA512_MECH_INFO_TYPE: - sha_digest_len = SHA512_DIGEST_LENGTH; - break; - default: - return (CRYPTO_MECHANISM_INVALID); - } - - /* - * We need to just return the length needed to store the output. - * We should not destroy the context for the following cases. - */ - if ((digest->cd_length == 0) || - (digest->cd_length < sha_digest_len)) { - digest->cd_length = sha_digest_len; - return (CRYPTO_BUFFER_TOO_SMALL); - } - - /* - * Do the SHA2 update on the specified input data. - */ - switch (data->cd_format) { - case CRYPTO_DATA_RAW: - SHA2Update(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx, - (uint8_t *)data->cd_raw.iov_base + data->cd_offset, - data->cd_length); - break; - case CRYPTO_DATA_UIO: - ret = sha2_digest_update_uio(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx, - data); - break; - default: - ret = CRYPTO_ARGUMENTS_BAD; - } - - if (ret != CRYPTO_SUCCESS) { - /* the update failed, free context and bail */ - kmem_free(ctx->cc_provider_private, sizeof (sha2_ctx_t)); - ctx->cc_provider_private = NULL; - digest->cd_length = 0; - return (ret); - } - - /* - * Do a SHA2 final, must be done separately since the digest - * type can be different than the input data type. 
- */ - switch (digest->cd_format) { - case CRYPTO_DATA_RAW: - SHA2Final((unsigned char *)digest->cd_raw.iov_base + - digest->cd_offset, &PROV_SHA2_CTX(ctx)->sc_sha2_ctx); - break; - case CRYPTO_DATA_UIO: - ret = sha2_digest_final_uio(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx, - digest, sha_digest_len, NULL); - break; - default: - ret = CRYPTO_ARGUMENTS_BAD; - } - - /* all done, free context and return */ - - if (ret == CRYPTO_SUCCESS) - digest->cd_length = sha_digest_len; - else - digest->cd_length = 0; - - kmem_free(ctx->cc_provider_private, sizeof (sha2_ctx_t)); - ctx->cc_provider_private = NULL; - return (ret); -} - -static int -sha2_digest_update(crypto_ctx_t *ctx, crypto_data_t *data) -{ - int ret = CRYPTO_SUCCESS; - - ASSERT(ctx->cc_provider_private != NULL); - - /* - * Do the SHA2 update on the specified input data. - */ - switch (data->cd_format) { - case CRYPTO_DATA_RAW: - SHA2Update(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx, - (uint8_t *)data->cd_raw.iov_base + data->cd_offset, - data->cd_length); - break; - case CRYPTO_DATA_UIO: - ret = sha2_digest_update_uio(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx, - data); - break; - default: - ret = CRYPTO_ARGUMENTS_BAD; - } - - return (ret); -} - -static int -sha2_digest_final(crypto_ctx_t *ctx, crypto_data_t *digest) -{ - int ret = CRYPTO_SUCCESS; - uint_t sha_digest_len; - - ASSERT(ctx->cc_provider_private != NULL); - - switch (PROV_SHA2_CTX(ctx)->sc_mech_type) { - case SHA256_MECH_INFO_TYPE: - sha_digest_len = SHA256_DIGEST_LENGTH; - break; - case SHA384_MECH_INFO_TYPE: - sha_digest_len = SHA384_DIGEST_LENGTH; - break; - case SHA512_MECH_INFO_TYPE: - sha_digest_len = SHA512_DIGEST_LENGTH; - break; - default: - return (CRYPTO_MECHANISM_INVALID); - } - - /* - * We need to just return the length needed to store the output. - * We should not destroy the context for the following cases. - */ - if ((digest->cd_length == 0) || - (digest->cd_length < sha_digest_len)) { - digest->cd_length = sha_digest_len; - return (CRYPTO_BUFFER_TOO_SMALL); - } - - /* - * Do a SHA2 final. - */ - switch (digest->cd_format) { - case CRYPTO_DATA_RAW: - SHA2Final((unsigned char *)digest->cd_raw.iov_base + - digest->cd_offset, &PROV_SHA2_CTX(ctx)->sc_sha2_ctx); - break; - case CRYPTO_DATA_UIO: - ret = sha2_digest_final_uio(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx, - digest, sha_digest_len, NULL); - break; - default: - ret = CRYPTO_ARGUMENTS_BAD; - } - - /* all done, free context and return */ - - if (ret == CRYPTO_SUCCESS) - digest->cd_length = sha_digest_len; - else - digest->cd_length = 0; - - kmem_free(ctx->cc_provider_private, sizeof (sha2_ctx_t)); - ctx->cc_provider_private = NULL; - - return (ret); -} - -static int -sha2_digest_atomic(crypto_mechanism_t *mechanism, crypto_data_t *data, - crypto_data_t *digest) -{ - int ret = CRYPTO_SUCCESS; - SHA2_CTX sha2_ctx; - uint32_t sha_digest_len; - - /* - * Do the SHA inits. - */ - - SHA2Init(mechanism->cm_type, &sha2_ctx); - - switch (data->cd_format) { - case CRYPTO_DATA_RAW: - SHA2Update(&sha2_ctx, (uint8_t *)data-> - cd_raw.iov_base + data->cd_offset, data->cd_length); - break; - case CRYPTO_DATA_UIO: - ret = sha2_digest_update_uio(&sha2_ctx, data); - break; - default: - ret = CRYPTO_ARGUMENTS_BAD; - } - - /* - * Do the SHA updates on the specified input data. 
- */ - - if (ret != CRYPTO_SUCCESS) { - /* the update failed, bail */ - digest->cd_length = 0; - return (ret); - } - - if (mechanism->cm_type <= SHA256_HMAC_GEN_MECH_INFO_TYPE) - sha_digest_len = SHA256_DIGEST_LENGTH; - else - sha_digest_len = SHA512_DIGEST_LENGTH; - - /* - * Do a SHA2 final, must be done separately since the digest - * type can be different than the input data type. - */ - switch (digest->cd_format) { - case CRYPTO_DATA_RAW: - SHA2Final((unsigned char *)digest->cd_raw.iov_base + - digest->cd_offset, &sha2_ctx); - break; - case CRYPTO_DATA_UIO: - ret = sha2_digest_final_uio(&sha2_ctx, digest, - sha_digest_len, NULL); - break; - default: - ret = CRYPTO_ARGUMENTS_BAD; - } - - if (ret == CRYPTO_SUCCESS) - digest->cd_length = sha_digest_len; - else - digest->cd_length = 0; - - return (ret); -} - -/* - * KCF software provider mac entry points. - * - * SHA2 HMAC is: SHA2(key XOR opad, SHA2(key XOR ipad, text)) - * - * Init: - * The initialization routine initializes what we denote - * as the inner and outer contexts by doing - * - for inner context: SHA2(key XOR ipad) - * - for outer context: SHA2(key XOR opad) - * - * Update: - * Each subsequent SHA2 HMAC update will result in an - * update of the inner context with the specified data. - * - * Final: - * The SHA2 HMAC final will do a SHA2 final operation on the - * inner context, and the resulting digest will be used - * as the data for an update on the outer context. Last - * but not least, a SHA2 final on the outer context will - * be performed to obtain the SHA2 HMAC digest to return - * to the user. - */ - -/* - * Initialize a SHA2-HMAC context. - */ -static void -sha2_mac_init_ctx(sha2_hmac_ctx_t *ctx, void *keyval, uint_t length_in_bytes) -{ - uint64_t ipad[SHA512_HMAC_BLOCK_SIZE / sizeof (uint64_t)] = {0}; - uint64_t opad[SHA512_HMAC_BLOCK_SIZE / sizeof (uint64_t)] = {0}; - int i, block_size, blocks_per_int64; - - /* Determine the block size */ - if (ctx->hc_mech_type <= SHA256_HMAC_GEN_MECH_INFO_TYPE) { - block_size = SHA256_HMAC_BLOCK_SIZE; - blocks_per_int64 = SHA256_HMAC_BLOCK_SIZE / sizeof (uint64_t); - } else { - block_size = SHA512_HMAC_BLOCK_SIZE; - blocks_per_int64 = SHA512_HMAC_BLOCK_SIZE / sizeof (uint64_t); - } - - (void) memset(ipad, 0, block_size); - (void) memset(opad, 0, block_size); - - if (keyval != NULL) { - (void) memcpy(ipad, keyval, length_in_bytes); - (void) memcpy(opad, keyval, length_in_bytes); - } else { - ASSERT0(length_in_bytes); - } - - /* XOR key with ipad (0x36) and opad (0x5c) */ - for (i = 0; i < blocks_per_int64; i ++) { - ipad[i] ^= 0x3636363636363636; - opad[i] ^= 0x5c5c5c5c5c5c5c5c; - } - - /* perform SHA2 on ipad */ - SHA2Init(ctx->hc_mech_type, &ctx->hc_icontext); - SHA2Update(&ctx->hc_icontext, (uint8_t *)ipad, block_size); - - /* perform SHA2 on opad */ - SHA2Init(ctx->hc_mech_type, &ctx->hc_ocontext); - SHA2Update(&ctx->hc_ocontext, (uint8_t *)opad, block_size); -} - -/* - */ -static int -sha2_mac_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism, - crypto_key_t *key, crypto_spi_ctx_template_t ctx_template) -{ - int ret = CRYPTO_SUCCESS; - uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length); - uint_t sha_digest_len, sha_hmac_block_size; - - /* - * Set the digest length and block size to values appropriate to the - * mechanism - */ - switch (mechanism->cm_type) { - case SHA256_HMAC_MECH_INFO_TYPE: - case SHA256_HMAC_GEN_MECH_INFO_TYPE: - sha_digest_len = SHA256_DIGEST_LENGTH; - sha_hmac_block_size = SHA256_HMAC_BLOCK_SIZE; - break; - case SHA384_HMAC_MECH_INFO_TYPE: - 
case SHA384_HMAC_GEN_MECH_INFO_TYPE: - case SHA512_HMAC_MECH_INFO_TYPE: - case SHA512_HMAC_GEN_MECH_INFO_TYPE: - sha_digest_len = SHA512_DIGEST_LENGTH; - sha_hmac_block_size = SHA512_HMAC_BLOCK_SIZE; - break; - default: - return (CRYPTO_MECHANISM_INVALID); - } - - ctx->cc_provider_private = - kmem_alloc(sizeof (sha2_hmac_ctx_t), KM_SLEEP); - if (ctx->cc_provider_private == NULL) - return (CRYPTO_HOST_MEMORY); - - PROV_SHA2_HMAC_CTX(ctx)->hc_mech_type = mechanism->cm_type; - if (ctx_template != NULL) { - /* reuse context template */ - memcpy(PROV_SHA2_HMAC_CTX(ctx), ctx_template, - sizeof (sha2_hmac_ctx_t)); - } else { - /* no context template, compute context */ - if (keylen_in_bytes > sha_hmac_block_size) { - uchar_t digested_key[SHA512_DIGEST_LENGTH]; - sha2_hmac_ctx_t *hmac_ctx = ctx->cc_provider_private; - - /* - * Hash the passed-in key to get a smaller key. - * The inner context is used since it hasn't been - * initialized yet. - */ - PROV_SHA2_DIGEST_KEY(mechanism->cm_type / 3, - &hmac_ctx->hc_icontext, - key->ck_data, keylen_in_bytes, digested_key); - sha2_mac_init_ctx(PROV_SHA2_HMAC_CTX(ctx), - digested_key, sha_digest_len); - } else { - sha2_mac_init_ctx(PROV_SHA2_HMAC_CTX(ctx), - key->ck_data, keylen_in_bytes); - } - } - - /* - * Get the mechanism parameters, if applicable. - */ - if (mechanism->cm_type % 3 == 2) { - if (mechanism->cm_param == NULL || - mechanism->cm_param_len != sizeof (ulong_t)) - ret = CRYPTO_MECHANISM_PARAM_INVALID; - PROV_SHA2_GET_DIGEST_LEN(mechanism, - PROV_SHA2_HMAC_CTX(ctx)->hc_digest_len); - if (PROV_SHA2_HMAC_CTX(ctx)->hc_digest_len > sha_digest_len) - ret = CRYPTO_MECHANISM_PARAM_INVALID; - } - - if (ret != CRYPTO_SUCCESS) { - memset(ctx->cc_provider_private, 0, sizeof (sha2_hmac_ctx_t)); - kmem_free(ctx->cc_provider_private, sizeof (sha2_hmac_ctx_t)); - ctx->cc_provider_private = NULL; - } - - return (ret); -} - -static int -sha2_mac_update(crypto_ctx_t *ctx, crypto_data_t *data) -{ - int ret = CRYPTO_SUCCESS; - - ASSERT(ctx->cc_provider_private != NULL); - - /* - * Do a SHA2 update of the inner context using the specified - * data. 
- */ - switch (data->cd_format) { - case CRYPTO_DATA_RAW: - SHA2Update(&PROV_SHA2_HMAC_CTX(ctx)->hc_icontext, - (uint8_t *)data->cd_raw.iov_base + data->cd_offset, - data->cd_length); - break; - case CRYPTO_DATA_UIO: - ret = sha2_digest_update_uio( - &PROV_SHA2_HMAC_CTX(ctx)->hc_icontext, data); - break; - default: - ret = CRYPTO_ARGUMENTS_BAD; - } - - return (ret); -} - -static int -sha2_mac_final(crypto_ctx_t *ctx, crypto_data_t *mac) -{ - int ret = CRYPTO_SUCCESS; - uchar_t digest[SHA512_DIGEST_LENGTH]; - uint32_t digest_len, sha_digest_len; - - ASSERT(ctx->cc_provider_private != NULL); - - /* Set the digest lengths to values appropriate to the mechanism */ - switch (PROV_SHA2_HMAC_CTX(ctx)->hc_mech_type) { - case SHA256_HMAC_MECH_INFO_TYPE: - sha_digest_len = digest_len = SHA256_DIGEST_LENGTH; - break; - case SHA384_HMAC_MECH_INFO_TYPE: - sha_digest_len = digest_len = SHA384_DIGEST_LENGTH; - break; - case SHA512_HMAC_MECH_INFO_TYPE: - sha_digest_len = digest_len = SHA512_DIGEST_LENGTH; - break; - case SHA256_HMAC_GEN_MECH_INFO_TYPE: - sha_digest_len = SHA256_DIGEST_LENGTH; - digest_len = PROV_SHA2_HMAC_CTX(ctx)->hc_digest_len; - break; - case SHA384_HMAC_GEN_MECH_INFO_TYPE: - case SHA512_HMAC_GEN_MECH_INFO_TYPE: - sha_digest_len = SHA512_DIGEST_LENGTH; - digest_len = PROV_SHA2_HMAC_CTX(ctx)->hc_digest_len; - break; - default: - return (CRYPTO_ARGUMENTS_BAD); - } - - /* - * We need to just return the length needed to store the output. - * We should not destroy the context for the following cases. - */ - if ((mac->cd_length == 0) || (mac->cd_length < digest_len)) { - mac->cd_length = digest_len; - return (CRYPTO_BUFFER_TOO_SMALL); - } - - /* - * Do a SHA2 final on the inner context. - */ - SHA2Final(digest, &PROV_SHA2_HMAC_CTX(ctx)->hc_icontext); - - /* - * Do a SHA2 update on the outer context, feeding the inner - * digest as data. - */ - SHA2Update(&PROV_SHA2_HMAC_CTX(ctx)->hc_ocontext, digest, - sha_digest_len); - - /* - * Do a SHA2 final on the outer context, storing the computing - * digest in the users buffer. - */ - switch (mac->cd_format) { - case CRYPTO_DATA_RAW: - if (digest_len != sha_digest_len) { - /* - * The caller requested a short digest. Digest - * into a scratch buffer and return to - * the user only what was requested. 
- */ - SHA2Final(digest, - &PROV_SHA2_HMAC_CTX(ctx)->hc_ocontext); - memcpy((unsigned char *)mac->cd_raw.iov_base + - mac->cd_offset, digest, digest_len); - } else { - SHA2Final((unsigned char *)mac->cd_raw.iov_base + - mac->cd_offset, - &PROV_SHA2_HMAC_CTX(ctx)->hc_ocontext); - } - break; - case CRYPTO_DATA_UIO: - ret = sha2_digest_final_uio( - &PROV_SHA2_HMAC_CTX(ctx)->hc_ocontext, mac, - digest_len, digest); - break; - default: - ret = CRYPTO_ARGUMENTS_BAD; - } - - if (ret == CRYPTO_SUCCESS) - mac->cd_length = digest_len; - else - mac->cd_length = 0; - - memset(ctx->cc_provider_private, 0, sizeof (sha2_hmac_ctx_t)); - kmem_free(ctx->cc_provider_private, sizeof (sha2_hmac_ctx_t)); - ctx->cc_provider_private = NULL; - - return (ret); -} - -#define SHA2_MAC_UPDATE(data, ctx, ret) { \ - switch (data->cd_format) { \ - case CRYPTO_DATA_RAW: \ - SHA2Update(&(ctx).hc_icontext, \ - (uint8_t *)data->cd_raw.iov_base + \ - data->cd_offset, data->cd_length); \ - break; \ - case CRYPTO_DATA_UIO: \ - ret = sha2_digest_update_uio(&(ctx).hc_icontext, data); \ - break; \ - default: \ - ret = CRYPTO_ARGUMENTS_BAD; \ - } \ -} - -static int -sha2_mac_atomic(crypto_mechanism_t *mechanism, - crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac, - crypto_spi_ctx_template_t ctx_template) -{ - int ret = CRYPTO_SUCCESS; - uchar_t digest[SHA512_DIGEST_LENGTH]; - sha2_hmac_ctx_t sha2_hmac_ctx; - uint32_t sha_digest_len, digest_len, sha_hmac_block_size; - uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length); - - /* - * Set the digest length and block size to values appropriate to the - * mechanism - */ - switch (mechanism->cm_type) { - case SHA256_HMAC_MECH_INFO_TYPE: - case SHA256_HMAC_GEN_MECH_INFO_TYPE: - sha_digest_len = digest_len = SHA256_DIGEST_LENGTH; - sha_hmac_block_size = SHA256_HMAC_BLOCK_SIZE; - break; - case SHA384_HMAC_MECH_INFO_TYPE: - case SHA384_HMAC_GEN_MECH_INFO_TYPE: - case SHA512_HMAC_MECH_INFO_TYPE: - case SHA512_HMAC_GEN_MECH_INFO_TYPE: - sha_digest_len = digest_len = SHA512_DIGEST_LENGTH; - sha_hmac_block_size = SHA512_HMAC_BLOCK_SIZE; - break; - default: - return (CRYPTO_MECHANISM_INVALID); - } - - if (ctx_template != NULL) { - /* reuse context template */ - memcpy(&sha2_hmac_ctx, ctx_template, sizeof (sha2_hmac_ctx_t)); - } else { - sha2_hmac_ctx.hc_mech_type = mechanism->cm_type; - /* no context template, initialize context */ - if (keylen_in_bytes > sha_hmac_block_size) { - /* - * Hash the passed-in key to get a smaller key. - * The inner context is used since it hasn't been - * initialized yet. - */ - PROV_SHA2_DIGEST_KEY(mechanism->cm_type / 3, - &sha2_hmac_ctx.hc_icontext, - key->ck_data, keylen_in_bytes, digest); - sha2_mac_init_ctx(&sha2_hmac_ctx, digest, - sha_digest_len); - } else { - sha2_mac_init_ctx(&sha2_hmac_ctx, key->ck_data, - keylen_in_bytes); - } - } - - /* get the mechanism parameters, if applicable */ - if ((mechanism->cm_type % 3) == 2) { - if (mechanism->cm_param == NULL || - mechanism->cm_param_len != sizeof (ulong_t)) { - ret = CRYPTO_MECHANISM_PARAM_INVALID; - goto bail; - } - PROV_SHA2_GET_DIGEST_LEN(mechanism, digest_len); - if (digest_len > sha_digest_len) { - ret = CRYPTO_MECHANISM_PARAM_INVALID; - goto bail; - } - } - - /* do a SHA2 update of the inner context using the specified data */ - SHA2_MAC_UPDATE(data, sha2_hmac_ctx, ret); - if (ret != CRYPTO_SUCCESS) - /* the update failed, free context and bail */ - goto bail; - - /* - * Do a SHA2 final on the inner context. 
- */ - SHA2Final(digest, &sha2_hmac_ctx.hc_icontext); - - /* - * Do an SHA2 update on the outer context, feeding the inner - * digest as data. - * - * HMAC-SHA384 needs special handling as the outer hash needs only 48 - * bytes of the inner hash value. - */ - if (mechanism->cm_type == SHA384_HMAC_MECH_INFO_TYPE || - mechanism->cm_type == SHA384_HMAC_GEN_MECH_INFO_TYPE) - SHA2Update(&sha2_hmac_ctx.hc_ocontext, digest, - SHA384_DIGEST_LENGTH); - else - SHA2Update(&sha2_hmac_ctx.hc_ocontext, digest, sha_digest_len); - - /* - * Do a SHA2 final on the outer context, storing the computed - * digest in the users buffer. - */ - switch (mac->cd_format) { - case CRYPTO_DATA_RAW: - if (digest_len != sha_digest_len) { - /* - * The caller requested a short digest. Digest - * into a scratch buffer and return to - * the user only what was requested. - */ - SHA2Final(digest, &sha2_hmac_ctx.hc_ocontext); - memcpy((unsigned char *)mac->cd_raw.iov_base + - mac->cd_offset, digest, digest_len); - } else { - SHA2Final((unsigned char *)mac->cd_raw.iov_base + - mac->cd_offset, &sha2_hmac_ctx.hc_ocontext); - } - break; - case CRYPTO_DATA_UIO: - ret = sha2_digest_final_uio(&sha2_hmac_ctx.hc_ocontext, mac, - digest_len, digest); - break; - default: - ret = CRYPTO_ARGUMENTS_BAD; - } - - if (ret == CRYPTO_SUCCESS) { - mac->cd_length = digest_len; - return (CRYPTO_SUCCESS); - } -bail: - memset(&sha2_hmac_ctx, 0, sizeof (sha2_hmac_ctx_t)); - mac->cd_length = 0; - return (ret); -} - -static int -sha2_mac_verify_atomic(crypto_mechanism_t *mechanism, - crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac, - crypto_spi_ctx_template_t ctx_template) -{ - int ret = CRYPTO_SUCCESS; - uchar_t digest[SHA512_DIGEST_LENGTH]; - sha2_hmac_ctx_t sha2_hmac_ctx; - uint32_t sha_digest_len, digest_len, sha_hmac_block_size; - uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length); - - /* - * Set the digest length and block size to values appropriate to the - * mechanism - */ - switch (mechanism->cm_type) { - case SHA256_HMAC_MECH_INFO_TYPE: - case SHA256_HMAC_GEN_MECH_INFO_TYPE: - sha_digest_len = digest_len = SHA256_DIGEST_LENGTH; - sha_hmac_block_size = SHA256_HMAC_BLOCK_SIZE; - break; - case SHA384_HMAC_MECH_INFO_TYPE: - case SHA384_HMAC_GEN_MECH_INFO_TYPE: - case SHA512_HMAC_MECH_INFO_TYPE: - case SHA512_HMAC_GEN_MECH_INFO_TYPE: - sha_digest_len = digest_len = SHA512_DIGEST_LENGTH; - sha_hmac_block_size = SHA512_HMAC_BLOCK_SIZE; - break; - default: - return (CRYPTO_MECHANISM_INVALID); - } - - if (ctx_template != NULL) { - /* reuse context template */ - memcpy(&sha2_hmac_ctx, ctx_template, sizeof (sha2_hmac_ctx_t)); - } else { - sha2_hmac_ctx.hc_mech_type = mechanism->cm_type; - /* no context template, initialize context */ - if (keylen_in_bytes > sha_hmac_block_size) { - /* - * Hash the passed-in key to get a smaller key. - * The inner context is used since it hasn't been - * initialized yet. 
- */ - PROV_SHA2_DIGEST_KEY(mechanism->cm_type / 3, - &sha2_hmac_ctx.hc_icontext, - key->ck_data, keylen_in_bytes, digest); - sha2_mac_init_ctx(&sha2_hmac_ctx, digest, - sha_digest_len); - } else { - sha2_mac_init_ctx(&sha2_hmac_ctx, key->ck_data, - keylen_in_bytes); - } - } - - /* get the mechanism parameters, if applicable */ - if (mechanism->cm_type % 3 == 2) { - if (mechanism->cm_param == NULL || - mechanism->cm_param_len != sizeof (ulong_t)) { - ret = CRYPTO_MECHANISM_PARAM_INVALID; - goto bail; - } - PROV_SHA2_GET_DIGEST_LEN(mechanism, digest_len); - if (digest_len > sha_digest_len) { - ret = CRYPTO_MECHANISM_PARAM_INVALID; - goto bail; - } - } - - if (mac->cd_length != digest_len) { - ret = CRYPTO_INVALID_MAC; - goto bail; - } - - /* do a SHA2 update of the inner context using the specified data */ - SHA2_MAC_UPDATE(data, sha2_hmac_ctx, ret); - if (ret != CRYPTO_SUCCESS) - /* the update failed, free context and bail */ - goto bail; - - /* do a SHA2 final on the inner context */ - SHA2Final(digest, &sha2_hmac_ctx.hc_icontext); - - /* - * Do an SHA2 update on the outer context, feeding the inner - * digest as data. - * - * HMAC-SHA384 needs special handling as the outer hash needs only 48 - * bytes of the inner hash value. - */ - if (mechanism->cm_type == SHA384_HMAC_MECH_INFO_TYPE || - mechanism->cm_type == SHA384_HMAC_GEN_MECH_INFO_TYPE) - SHA2Update(&sha2_hmac_ctx.hc_ocontext, digest, - SHA384_DIGEST_LENGTH); - else - SHA2Update(&sha2_hmac_ctx.hc_ocontext, digest, sha_digest_len); - - /* - * Do a SHA2 final on the outer context, storing the computed - * digest in the users buffer. - */ - SHA2Final(digest, &sha2_hmac_ctx.hc_ocontext); - - /* - * Compare the computed digest against the expected digest passed - * as argument. - */ - - switch (mac->cd_format) { - - case CRYPTO_DATA_RAW: - if (memcmp(digest, (unsigned char *)mac->cd_raw.iov_base + - mac->cd_offset, digest_len) != 0) - ret = CRYPTO_INVALID_MAC; - break; - - case CRYPTO_DATA_UIO: { - off_t offset = mac->cd_offset; - uint_t vec_idx = 0; - off_t scratch_offset = 0; - size_t length = digest_len; - size_t cur_len; - - /* we support only kernel buffer */ - if (zfs_uio_segflg(mac->cd_uio) != UIO_SYSSPACE) - return (CRYPTO_ARGUMENTS_BAD); - - /* jump to the first iovec containing the expected digest */ - offset = zfs_uio_index_at_offset(mac->cd_uio, offset, &vec_idx); - if (vec_idx == zfs_uio_iovcnt(mac->cd_uio)) { - /* - * The caller specified an offset that is - * larger than the total size of the buffers - * it provided. - */ - ret = CRYPTO_DATA_LEN_RANGE; - break; - } - - /* do the comparison of computed digest vs specified one */ - while (vec_idx < zfs_uio_iovcnt(mac->cd_uio) && length > 0) { - cur_len = MIN(zfs_uio_iovlen(mac->cd_uio, vec_idx) - - offset, length); - - if (memcmp(digest + scratch_offset, - zfs_uio_iovbase(mac->cd_uio, vec_idx) + offset, - cur_len) != 0) { - ret = CRYPTO_INVALID_MAC; - break; - } - - length -= cur_len; - vec_idx++; - scratch_offset += cur_len; - offset = 0; - } - break; - } - - default: - ret = CRYPTO_ARGUMENTS_BAD; - } - - return (ret); -bail: - memset(&sha2_hmac_ctx, 0, sizeof (sha2_hmac_ctx_t)); - mac->cd_length = 0; - return (ret); -} - -/* - * KCF software provider context management entry points. 
- */ - -static int -sha2_create_ctx_template(crypto_mechanism_t *mechanism, crypto_key_t *key, - crypto_spi_ctx_template_t *ctx_template, size_t *ctx_template_size) -{ - sha2_hmac_ctx_t *sha2_hmac_ctx_tmpl; - uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length); - uint32_t sha_digest_len, sha_hmac_block_size; - - /* - * Set the digest length and block size to values appropriate to the - * mechanism - */ - switch (mechanism->cm_type) { - case SHA256_HMAC_MECH_INFO_TYPE: - case SHA256_HMAC_GEN_MECH_INFO_TYPE: - sha_digest_len = SHA256_DIGEST_LENGTH; - sha_hmac_block_size = SHA256_HMAC_BLOCK_SIZE; - break; - case SHA384_HMAC_MECH_INFO_TYPE: - case SHA384_HMAC_GEN_MECH_INFO_TYPE: - case SHA512_HMAC_MECH_INFO_TYPE: - case SHA512_HMAC_GEN_MECH_INFO_TYPE: - sha_digest_len = SHA512_DIGEST_LENGTH; - sha_hmac_block_size = SHA512_HMAC_BLOCK_SIZE; - break; - default: - return (CRYPTO_MECHANISM_INVALID); - } - - /* - * Allocate and initialize SHA2 context. - */ - sha2_hmac_ctx_tmpl = kmem_alloc(sizeof (sha2_hmac_ctx_t), KM_SLEEP); - if (sha2_hmac_ctx_tmpl == NULL) - return (CRYPTO_HOST_MEMORY); - - sha2_hmac_ctx_tmpl->hc_mech_type = mechanism->cm_type; - - if (keylen_in_bytes > sha_hmac_block_size) { - uchar_t digested_key[SHA512_DIGEST_LENGTH]; - - /* - * Hash the passed-in key to get a smaller key. - * The inner context is used since it hasn't been - * initialized yet. - */ - PROV_SHA2_DIGEST_KEY(mechanism->cm_type / 3, - &sha2_hmac_ctx_tmpl->hc_icontext, - key->ck_data, keylen_in_bytes, digested_key); - sha2_mac_init_ctx(sha2_hmac_ctx_tmpl, digested_key, - sha_digest_len); - } else { - sha2_mac_init_ctx(sha2_hmac_ctx_tmpl, key->ck_data, - keylen_in_bytes); - } - - *ctx_template = (crypto_spi_ctx_template_t)sha2_hmac_ctx_tmpl; - *ctx_template_size = sizeof (sha2_hmac_ctx_t); - - return (CRYPTO_SUCCESS); -} - -static int -sha2_free_context(crypto_ctx_t *ctx) -{ - uint_t ctx_len; - - if (ctx->cc_provider_private == NULL) - return (CRYPTO_SUCCESS); - - /* - * We have to free either SHA2 or SHA2-HMAC contexts, which - * have different lengths. - * - * Note: Below is dependent on the mechanism ordering. 
- */ - - if (PROV_SHA2_CTX(ctx)->sc_mech_type % 3 == 0) - ctx_len = sizeof (sha2_ctx_t); - else - ctx_len = sizeof (sha2_hmac_ctx_t); - - memset(ctx->cc_provider_private, 0, ctx_len); - kmem_free(ctx->cc_provider_private, ctx_len); - ctx->cc_provider_private = NULL; - - return (CRYPTO_SUCCESS); -} diff --git a/module/os/freebsd/zfs/zio_crypt.c b/module/os/freebsd/zfs/zio_crypt.c index 0410ddd65a5c..9e8f48c334c5 100644 --- a/module/os/freebsd/zfs/zio_crypt.c +++ b/module/os/freebsd/zfs/zio_crypt.c @@ -1162,12 +1162,12 @@ zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf, uint8_t digestbuf[SHA512_DIGEST_LENGTH]; /* checksum all of the MACs from the layer below */ - SHA2Init(SHA512, &ctx); + SHA2Init(&ctx, SHA512); for (i = 0, bp = buf; i < epb; i++, bp++) { zio_crypt_bp_do_indrect_checksum_updates(&ctx, version, byteswap, bp); } - SHA2Final(digestbuf, &ctx); + SHA2Final(&ctx, digestbuf); if (generate) { memcpy(cksum, digestbuf, ZIO_DATA_MAC_LEN); diff --git a/module/os/linux/zfs/zio_crypt.c b/module/os/linux/zfs/zio_crypt.c index dcab02b07894..0f3237c8eb7c 100644 --- a/module/os/linux/zfs/zio_crypt.c +++ b/module/os/linux/zfs/zio_crypt.c @@ -1316,12 +1316,12 @@ zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf, uint8_t digestbuf[SHA512_DIGEST_LENGTH]; /* checksum all of the MACs from the layer below */ - SHA2Init(SHA512, &ctx); + SHA2Init(&ctx, SHA512); for (i = 0, bp = buf; i < epb; i++, bp++) { zio_crypt_bp_do_indrect_checksum_updates(&ctx, version, byteswap, bp); } - SHA2Final(digestbuf, &ctx); + SHA2Final(&ctx, digestbuf); if (generate) { memcpy(cksum, digestbuf, ZIO_DATA_MAC_LEN); diff --git a/module/zfs/sha2_zfs.c b/module/zfs/sha2_zfs.c index 445d82ed020c..12065f5a1b8c 100644 --- a/module/zfs/sha2_zfs.c +++ b/module/zfs/sha2_zfs.c @@ -18,17 +18,16 @@ * * CDDL HEADER END */ + /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2016 by Delphix. All rights reserved. 
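The zio_crypt.c hunks above also show the reworked SHA2 calling convention: the context is now the first argument everywhere, and SHA2Init() takes the algorithm id rather than a KCF mechanism type. A minimal sketch of the new usage, assuming the SHA2_CTX type, the SHA512 constant and the SHA2Init/SHA2Update/SHA2Final prototypes declared by the new sys/sha2.h:

	#include <sys/sha2.h>

	/* one-shot SHA-512 digest with the reworked API (illustrative helper) */
	static void
	sha512_digest_example(const void *buf, size_t len,
	    uint8_t out[SHA512_DIGEST_LENGTH])
	{
		SHA2_CTX ctx;

		SHA2Init(&ctx, SHA512);		/* was: SHA2Init(SHA512, &ctx) */
		SHA2Update(&ctx, buf, len);
		SHA2Final(&ctx, out);		/* was: SHA2Final(out, &ctx) */
	}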
*/ + #include -#include #include + #include #include #include @@ -42,7 +41,7 @@ sha_incremental(void *buf, size_t size, void *arg) } void -abd_checksum_SHA256(abd_t *abd, uint64_t size, +abd_checksum_sha256(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) ctx_template; @@ -56,13 +55,12 @@ abd_checksum_SHA256(abd_t *abd, uint64_t size, abd_return_buf(abd, buf, size); if (ret == CPA_STATUS_SUCCESS) goto bswap; - - /* If the hardware implementation fails fall back to software */ + /* If the qat implementation fails - fall back to software */ } - SHA2Init(SHA256, &ctx); + SHA2Init(&ctx, SHA256); (void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx); - SHA2Final(&tmp, &ctx); + SHA2Final(&ctx, &tmp); bswap: /* @@ -79,24 +77,24 @@ abd_checksum_SHA256(abd_t *abd, uint64_t size, } void -abd_checksum_SHA512_native(abd_t *abd, uint64_t size, +abd_checksum_sha512_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) ctx_template; - SHA2_CTX ctx; + SHA2_CTX ctx; - SHA2Init(SHA512_256, &ctx); + SHA2Init(&ctx, SHA512_256); (void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx); - SHA2Final(zcp, &ctx); + SHA2Final(&ctx, zcp); } void -abd_checksum_SHA512_byteswap(abd_t *abd, uint64_t size, +abd_checksum_sha512_byteswap(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { - zio_cksum_t tmp; + zio_cksum_t tmp; - abd_checksum_SHA512_native(abd, size, ctx_template, &tmp); + abd_checksum_sha512_native(abd, size, ctx_template, &tmp); zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); diff --git a/module/zfs/zfs_chksum.c b/module/zfs/zfs_chksum.c index b9dc907afa8d..2b7402fe6d1a 100644 --- a/module/zfs/zfs_chksum.c +++ b/module/zfs/zfs_chksum.c @@ -23,13 +23,13 @@ * Copyright (c) 2021-2022 Tino Reichardt */ -#include -#include #include #include #include +#include #include +#include /* limit benchmarking to max 256KiB, when EdonR is slower then this: */ #define LIMIT_PERF_MBS 300 @@ -237,24 +237,32 @@ chksum_benchit(chksum_stat_t *cs) static void chksum_benchmark(void) { - #ifndef _KERNEL /* we need the benchmark only for the kernel module */ return; #endif chksum_stat_t *cs; - int cbid = 0, id; uint64_t max = 0; + uint32_t id, cid = 0, id_save; + const zfs_impl_t *blake3 = zfs_impl_get_ops("blake3"); + const zfs_impl_t *sha256 = zfs_impl_get_ops("sha256"); + const zfs_impl_t *sha512 = zfs_impl_get_ops("sha512"); + + /* count implementations */ + chksum_stat_cnt = 2; + chksum_stat_cnt += sha256->getcnt(); + chksum_stat_cnt += sha512->getcnt(); + chksum_stat_cnt += blake3->getcnt(); /* space for the benchmark times */ - chksum_stat_cnt = 4; - chksum_stat_cnt += blake3_get_impl_count(); chksum_stat_data = (chksum_stat_t *)kmem_zalloc( sizeof (chksum_stat_t) * chksum_stat_cnt, KM_SLEEP); /* edonr - needs to be the first one here (slow CPU check) */ - cs = &chksum_stat_data[cbid++]; + cs = &chksum_stat_data[cid++]; + + /* edonr */ cs->init = abd_checksum_edonr_tmpl_init; cs->func = abd_checksum_edonr_native; cs->free = abd_checksum_edonr_tmpl_free; @@ -263,7 +271,7 @@ chksum_benchmark(void) chksum_benchit(cs); /* skein */ - cs = &chksum_stat_data[cbid++]; + cs = &chksum_stat_data[cid++]; cs->init = abd_checksum_skein_tmpl_init; cs->func = abd_checksum_skein_native; cs->free = abd_checksum_skein_tmpl_free; @@ -272,38 +280,58 @@ chksum_benchmark(void) chksum_benchit(cs); /* sha256 */ - cs = &chksum_stat_data[cbid++]; - cs->init = 0; - cs->func = 
abd_checksum_SHA256; - cs->free = 0; - cs->name = "sha256"; - cs->impl = "generic"; - chksum_benchit(cs); + id_save = sha256->getid(); + for (id = 0; id < sha256->getcnt(); id++) { + sha256->setid(id); + cs = &chksum_stat_data[cid++]; + cs->init = 0; + cs->func = abd_checksum_sha256; + cs->free = 0; + cs->name = sha256->name; + cs->impl = sha256->getname(); + chksum_benchit(cs); + if (cs->bs256k > max) { + max = cs->bs256k; + sha256->set_fastest(id); + } + } + sha256->setid(id_save); /* sha512 */ - cs = &chksum_stat_data[cbid++]; - cs->init = 0; - cs->func = abd_checksum_SHA512_native; - cs->free = 0; - cs->name = "sha512"; - cs->impl = "generic"; - chksum_benchit(cs); + id_save = sha512->getid(); + for (id = 0; id < sha512->getcnt(); id++) { + sha512->setid(id); + cs = &chksum_stat_data[cid++]; + cs->init = 0; + cs->func = abd_checksum_sha512_native; + cs->free = 0; + cs->name = sha512->name; + cs->impl = sha512->getname(); + chksum_benchit(cs); + if (cs->bs256k > max) { + max = cs->bs256k; + sha512->set_fastest(id); + } + } + sha512->setid(id_save); /* blake3 */ - for (id = 0; id < blake3_get_impl_count(); id++) { - blake3_set_impl_id(id); - cs = &chksum_stat_data[cbid++]; + id_save = blake3->getid(); + for (id = 0; id < blake3->getcnt(); id++) { + blake3->setid(id); + cs = &chksum_stat_data[cid++]; cs->init = abd_checksum_blake3_tmpl_init; cs->func = abd_checksum_blake3_native; cs->free = abd_checksum_blake3_tmpl_free; - cs->name = "blake3"; - cs->impl = blake3_get_impl_name(); + cs->name = blake3->name; + cs->impl = blake3->getname(); chksum_benchit(cs); if (cs->bs256k > max) { max = cs->bs256k; - blake3_set_impl_fastest(id); + blake3->set_fastest(id); } } + blake3->setid(id_save); } void @@ -329,9 +357,6 @@ chksum_init(void) chksum_kstat_addr); kstat_install(chksum_kstat); } - - /* setup implementations */ - blake3_setup_impl(); } void diff --git a/module/zfs/zfs_impl.c b/module/zfs/zfs_impl.c new file mode 100644 index 000000000000..0bd2b66c7ee5 --- /dev/null +++ b/module/zfs/zfs_impl.c @@ -0,0 +1,60 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2022 Tino Reichardt
+ */
+
+#include
+#include
+#include
+
+#include
+#include
+
+/*
+ * impl_ops - backend for implementations of algorithms
+ */
+const zfs_impl_t *impl_ops[] = {
+	&zfs_blake3_ops,
+	&zfs_sha256_ops,
+	&zfs_sha512_ops,
+	NULL
+};
+
+/*
+ * zfs_impl_get_ops - Get the API functions for an impl backend
+ */
+const zfs_impl_t *
+zfs_impl_get_ops(const char *algo)
+{
+	const zfs_impl_t **ops = impl_ops;
+
+	if (!algo || !*algo)
+		return (*ops);
+
+	for (; *ops; ops++) {
+		if (strcmp(algo, (*ops)->name) == 0)
+			break;
+	}
+
+	return (*ops);
+}
diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c
index b3c5fbbd8bba..87aa64ad3aa2 100644
--- a/module/zfs/zio_checksum.c
+++ b/module/zfs/zio_checksum.c
@@ -165,10 +165,10 @@ const zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
 	{{NULL, NULL}, NULL, NULL, 0, "on"},
 	{{abd_checksum_off, abd_checksum_off},
 	    NULL, NULL, 0, "off"},
-	{{abd_checksum_SHA256, abd_checksum_SHA256},
+	{{abd_checksum_sha256, abd_checksum_sha256},
 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
 	    "label"},
-	{{abd_checksum_SHA256, abd_checksum_SHA256},
+	{{abd_checksum_sha256, abd_checksum_sha256},
 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
 	    "gang_header"},
 	{{abd_fletcher_2_native, abd_fletcher_2_byteswap},
@@ -177,14 +177,14 @@ const zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
 	    NULL, NULL, 0, "fletcher2"},
 	{{abd_fletcher_4_native, abd_fletcher_4_byteswap},
 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"},
-	{{abd_checksum_SHA256, abd_checksum_SHA256},
+	{{abd_checksum_sha256, abd_checksum_sha256},
 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
 	    ZCHECKSUM_FLAG_NOPWRITE, "sha256"},
 	{{abd_fletcher_4_native, abd_fletcher_4_byteswap},
 	    NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
 	{{abd_checksum_off, abd_checksum_off},
 	    NULL, NULL, 0, "noparity"},
-	{{abd_checksum_SHA512_native, abd_checksum_SHA512_byteswap},
+	{{abd_checksum_sha512_native, abd_checksum_sha512_byteswap},
 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
 	    ZCHECKSUM_FLAG_NOPWRITE, "sha512"},
 	{{abd_checksum_skein_native, abd_checksum_skein_byteswap},
diff --git a/tests/zfs-tests/cmd/checksum/blake3_test.c b/tests/zfs-tests/cmd/checksum/blake3_test.c
index d57d0e047f01..aebe0363cc6e 100644
--- a/tests/zfs-tests/cmd/checksum/blake3_test.c
+++ b/tests/zfs-tests/cmd/checksum/blake3_test.c
@@ -31,6 +31,8 @@
 #include
 #include
+#include
+
 /*
  * set it to a define for debugging
  */
 
@@ -485,10 +487,14 @@ main(int argc, char *argv[])
 	uint8_t buffer[102400];
 	uint64_t cpu_mhz = 0;
 	int id, i, j;
+	const zfs_impl_t *blake3 = zfs_impl_get_ops("blake3");
 
 	if (argc == 2)
 		cpu_mhz = atoi(argv[1]);
 
+	if (!blake3)
+		return (1);
+
 	/* fill test message */
 	for (i = 0, j = 0; i < sizeof (buffer); i++, j++) {
 		if (j == 251)
@@ -497,9 +503,9 @@ main(int argc, char *argv[])
 	}
 
 	(void) printf("Running algorithm correctness tests:\n");
-	for (id = 0; id < blake3_get_impl_count(); id++) {
-		blake3_set_impl_id(id);
-		const char *name = blake3_get_impl_name();
+	for (id = 0; id < blake3->getcnt(); id++) {
+		blake3->setid(id);
+		const char *name = blake3->getname();
 		dprintf("Result for BLAKE3-%s:\n", name);
 		for (i = 0; TestArray[i].hash; i++) {
 			blake3_test_t *cur =
&TestArray[i]; @@ -565,9 +571,9 @@ main(int argc, char *argv[]) } while (0) printf("Running performance tests (hashing 1024 MiB of data):\n"); - for (id = 0; id < blake3_get_impl_count(); id++) { - blake3_set_impl_id(id); - const char *name = blake3_get_impl_name(); + for (id = 0; id < blake3->getcnt(); id++) { + blake3->setid(id); + const char *name = blake3->getname(); BLAKE3_PERF_TEST(name, 256); } diff --git a/tests/zfs-tests/cmd/checksum/sha2_test.c b/tests/zfs-tests/cmd/checksum/sha2_test.c index bb355311091e..6c90eb66ffe5 100644 --- a/tests/zfs-tests/cmd/checksum/sha2_test.c +++ b/tests/zfs-tests/cmd/checksum/sha2_test.c @@ -34,10 +34,9 @@ #include #include #include -#define _SHA2_IMPL -#include #include - +#include +#include /* * Test messages from: @@ -173,17 +172,27 @@ main(int argc, char *argv[]) { boolean_t failed = B_FALSE; uint64_t cpu_mhz = 0; + uint32_t id; + + const zfs_impl_t *sha256 = zfs_impl_get_ops("sha256"); + const zfs_impl_t *sha512 = zfs_impl_get_ops("sha512"); if (argc == 2) cpu_mhz = atoi(argv[1]); + if (!sha256) + return (1); + + if (!sha512) + return (1); + #define SHA2_ALGO_TEST(_m, mode, diglen, testdigest) \ do { \ SHA2_CTX ctx; \ uint8_t digest[diglen / 8]; \ - SHA2Init(SHA ## mode ## _MECH_INFO_TYPE, &ctx); \ + SHA2Init(&ctx, mode); \ SHA2Update(&ctx, _m, strlen(_m)); \ - SHA2Final(digest, &ctx); \ + SHA2Final(&ctx, digest); \ (void) printf("SHA%-9sMessage: " #_m \ "\tResult: ", #mode); \ if (memcmp(digest, testdigest, diglen / 8) == 0) { \ @@ -194,7 +203,7 @@ main(int argc, char *argv[]) } \ } while (0) -#define SHA2_PERF_TEST(mode, diglen) \ +#define SHA2_PERF_TEST(mode, diglen, name) \ do { \ SHA2_CTX ctx; \ uint8_t digest[diglen / 8]; \ @@ -205,10 +214,10 @@ main(int argc, char *argv[]) struct timeval start, end; \ memset(block, 0, sizeof (block)); \ (void) gettimeofday(&start, NULL); \ - SHA2Init(SHA ## mode ## _MECH_INFO_TYPE, &ctx); \ + SHA2Init(&ctx, mode); \ for (i = 0; i < 8192; i++) \ SHA2Update(&ctx, block, sizeof (block)); \ - SHA2Final(digest, &ctx); \ + SHA2Final(&ctx, digest); \ (void) gettimeofday(&end, NULL); \ delta = (end.tv_sec * 1000000llu + end.tv_usec) - \ (start.tv_sec * 1000000llu + start.tv_usec); \ @@ -216,29 +225,38 @@ main(int argc, char *argv[]) cpb = (cpu_mhz * 1e6 * ((double)delta / \ 1000000)) / (8192 * 128 * 1024); \ } \ - (void) printf("SHA%-9s%llu us (%.02f CPB)\n", #mode, \ - (u_longlong_t)delta, cpb); \ + (void) printf("SHA%-9s%llu us (%s %.02f CPB)\n", #mode, \ + (u_longlong_t)delta, name, cpb); \ } while (0) (void) printf("Running algorithm correctness tests:\n"); - SHA2_ALGO_TEST(test_msg0, 256, 256, sha256_test_digests[0]); - SHA2_ALGO_TEST(test_msg1, 256, 256, sha256_test_digests[1]); - SHA2_ALGO_TEST(test_msg0, 384, 384, sha384_test_digests[0]); - SHA2_ALGO_TEST(test_msg2, 384, 384, sha384_test_digests[2]); - SHA2_ALGO_TEST(test_msg0, 512, 512, sha512_test_digests[0]); - SHA2_ALGO_TEST(test_msg2, 512, 512, sha512_test_digests[2]); - SHA2_ALGO_TEST(test_msg0, 512_224, 224, sha512_224_test_digests[0]); - SHA2_ALGO_TEST(test_msg2, 512_224, 224, sha512_224_test_digests[2]); - SHA2_ALGO_TEST(test_msg0, 512_256, 256, sha512_256_test_digests[0]); - SHA2_ALGO_TEST(test_msg2, 512_256, 256, sha512_256_test_digests[2]); + SHA2_ALGO_TEST(test_msg0, SHA256, 256, sha256_test_digests[0]); + SHA2_ALGO_TEST(test_msg1, SHA256, 256, sha256_test_digests[1]); + SHA2_ALGO_TEST(test_msg0, SHA384, 384, sha384_test_digests[0]); + SHA2_ALGO_TEST(test_msg2, SHA384, 384, sha384_test_digests[2]); + + SHA2_ALGO_TEST(test_msg0, SHA512, 512, 
sha512_test_digests[0]); + SHA2_ALGO_TEST(test_msg2, SHA512, 512, sha512_test_digests[2]); + SHA2_ALGO_TEST(test_msg0, SHA512_256, 256, sha512_256_test_digests[0]); + SHA2_ALGO_TEST(test_msg2, SHA512_256, 256, sha512_256_test_digests[2]); if (failed) return (1); (void) printf("Running performance tests (hashing 1024 MiB of " "data):\n"); - SHA2_PERF_TEST(256, 256); - SHA2_PERF_TEST(512, 512); + + for (id = 0; id < sha256->getcnt(); id++) { + sha256->setid(id); + const char *name = sha256->getname(); + SHA2_PERF_TEST(SHA256, 256, name); + } + + for (id = 0; id < sha512->getcnt(); id++) { + sha512->setid(id); + const char *name = sha512->getname(); + SHA2_PERF_TEST(SHA512, 512, name); + } return (0); }
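The updated tests also illustrate the consumer pattern that replaces the old blake3_* and KCF entry points: look a backend up by name with zfs_impl_get_ops(), then drive it through the zfs_impl_t function pointers. A self-contained sketch under that assumption; the program below is illustrative and not part of the patch:

	#include <stdio.h>
	#include <sys/zfs_impl.h>

	int
	main(void)
	{
		const zfs_impl_t *sha256 = zfs_impl_get_ops("sha256");
		uint32_t id;

		if (sha256 == NULL)
			return (1);

		/* enumerate the implementations compiled into this build */
		for (id = 0; id < sha256->getcnt(); id++) {
			sha256->setid(id);
			(void) printf("sha256 impl %u: %s\n", id, sha256->getname());
		}

		/* pin one implementation by name, as ztest does for blake3 */
		return (sha256->setname("generic") == 0 ? 0 : 1);
	}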