diff --git a/include/os/windows/spl/sys/simd.h b/include/os/windows/spl/sys/simd.h index 939ca4e0d732..ecbdab9bdd79 100644 --- a/include/os/windows/spl/sys/simd.h +++ b/include/os/windows/spl/sys/simd.h @@ -92,8 +92,10 @@ xgetbv(uint32_t c) #endif -#define CPUID_FEATURE_AES (1<<25) -#define CPUID_FEATURE_XSAVE (1<<26) +#define CPUID_FEATURE_PCLMULQDQ (1<<1) +#define CPUID_FEATURE_AES (1<<25) +#define CPUID_FEATURE_XSAVE (1<<26) +//#define CPUID_FEATURE_AVX (1<<28) extern uint64_t spl_cpuid_features(void); extern uint64_t spl_cpuid_leaf7_features(void); diff --git a/include/os/windows/zfs/zfs_config.h b/include/os/windows/zfs/zfs_config.h index b28eb74fab07..09d76d2330f2 100644 --- a/include/os/windows/zfs/zfs_config.h +++ b/include/os/windows/zfs/zfs_config.h @@ -63,6 +63,9 @@ #define HAVE_USLEEP 1 +/* These control which assembler files to use */ +//#define HAVE_AVX 1 +#define HAVE_PCLMULQDQ 1 #define HAVE_AES 1 /* Path where the kernel module is installed. */ diff --git a/lib/libicp/CMakeLists.txt b/lib/libicp/CMakeLists.txt index ba445d132c30..e7a71aed06b9 100644 --- a/lib/libicp/CMakeLists.txt +++ b/lib/libicp/CMakeLists.txt @@ -45,6 +45,7 @@ add_library(libicp "${ICP_MODULE_DIR}/asm-x86_64/os/windows/aes/aes_amd64.S" "${ICP_MODULE_DIR}/asm-x86_64/os/windows/sha2/sha256_impl.S" "${ICP_MODULE_DIR}/asm-x86_64/os/windows/sha2/sha512_impl.S" + "${ICP_MODULE_DIR}/asm-x86_64/os/windows/modes/gcm_pclmulqdq.S" ) # Add windows/assembler sources here too. 
diff --git a/module/icp/CMakeLists.txt b/module/icp/CMakeLists.txt index 11bf8e990704..5fc885c6845b 100644 --- a/module/icp/CMakeLists.txt +++ b/module/icp/CMakeLists.txt @@ -18,6 +18,7 @@ wdk_add_library(icpkern algs/modes/ecb.c algs/modes/gcm.c algs/modes/gcm_generic.c + algs/modes/gcm_pclmulqdq.c algs/modes/modes.c algs/sha2/sha2.c algs/skein/skein.c @@ -33,7 +34,7 @@ wdk_add_library(icpkern asm-x86_64/os/windows/aes/aes_aesni.S asm-x86_64/os/windows/aes/aes_amd64.S asm-x86_64/modes/aesni-gcm-x86_64.S - asm-x86_64/modes/gcm_pclmulqdq.S + asm-x86_64/os/windows/modes/gcm_pclmulqdq.S asm-x86_64/modes/ghash-x86_64.S # asm-x86_64/sha1/sha1-x86_64.S asm-x86_64/os/windows/sha2/sha256_impl.S diff --git a/module/icp/asm-x86_64/os/windows/modes/gcm_pclmulqdq.S b/module/icp/asm-x86_64/os/windows/modes/gcm_pclmulqdq.S new file mode 100644 index 000000000000..1e82258a8621 --- /dev/null +++ b/module/icp/asm-x86_64/os/windows/modes/gcm_pclmulqdq.S @@ -0,0 +1,267 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2009 Intel Corporation + * All Rights Reserved. + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* + * Accelerated GHASH implementation with Intel PCLMULQDQ-NI + * instructions. This file contains an accelerated + * Galois Field Multiplication implementation. + * + * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH, + * carry-less multiplication. More information about PCLMULQDQ can be + * found at: + * http://software.intel.com/en-us/articles/ + * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/ + * + */ + +/* + * ==================================================================== + * OpenSolaris OS modifications + * + * This source originates as file galois_hash_asm.c from + * Intel Corporation dated September 21, 2009. + * + * This OpenSolaris version has these major changes from the original source: + * + * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from + * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function + * definition for lint. + * + * 2. Formatted code, added comments, and added #includes and #defines. + * + * 3. If bit CR0.TS is set, clear and set the TS bit, after and before + * calling kpreempt_disable() and kpreempt_enable(). + * If the TS bit is not set, Save and restore %xmm registers at the beginning + * and end of function calls (%xmm* registers are not saved and restored by + * during kernel thread preemption). + * + * 4. Removed code to perform hashing. This is already done with C macro + * GHASH in gcm.c. For better performance, this removed code should be + * reintegrated in the future to replace the C GHASH macro. + * + * 5. Added code to byte swap 16-byte input and output. + * + * 6. Folded in comments from the original C source with embedded assembly + * (SB_w_shift_xor.c) + * + * 7. 
+ Renamed function and reordered parameters to match OpenSolaris: + * Intel interface: + * void galois_hash_asm(unsigned char *hk, unsigned char *s, + * unsigned char *d, int length) + * OpenSolaris OS interface: + * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res); + * ==================================================================== + */ + + +#if defined(lint) || defined(__lint) /* lint */ + +#include <sys/types.h> + +void +gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) { + (void) x_in, (void) y, (void) res; +} + +#elif defined(HAVE_PCLMULQDQ) /* guard by instruction set */ + +#define _ASM +#include <sys/asm_linkage.h> + +/* + * Use this mask to byte-swap a 16-byte integer with the pshufb instruction + */ + +// static uint8_t byte_swap16_mask[] = { +// 15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 }; +.section .rodata +.align XMM_ALIGN +.Lbyte_swap16_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + + +/* + * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res); + * + * Perform a carry-less multiplication (that is, use XOR instead of the + * multiply operator) on P1 and P2 and place the result in P3. + * + * Byte swap the input and the output. + * + * Note: x_in, y, and res all point to a block of 16-byte numbers + * (an array of two 64-bit integers). + * + * Note2: For kernel code, caller is responsible for ensuring + * kpreempt_disable() has been called. This is because %xmm registers are + * not saved/restored. Clear and set the CR0.TS bit on entry and exit, + * respectively, if TS is set on entry. Otherwise, if TS is not set, + * save and restore %xmm registers on the stack. 
+ * + * Note3: Original Intel definition: + * void galois_hash_asm(unsigned char *hk, unsigned char *s, + * unsigned char *d, int length) + * + * Note4: Register/parameter mapping: + * Intel: + * Parameter 1: %rcx (copied to %xmm0) hk or x_in + * Parameter 2: %rdx (copied to %xmm1) s or y + * Parameter 3: %rdi (result) d or res + * OpenSolaris: + * Parameter 1: %rdi (copied to %xmm0) x_in + * Parameter 2: %rsi (copied to %xmm1) y + * Parameter 3: %rdx (result) res + */ +// Windows x64: +// Calling: rcx, rdx, r8, and r9 (float: xmm0-xmm3) +// Return: rax (float: xmm0) +// Volatile: rax, rcx, rdx, r8-r11 +// Nonvolatile: rbx, rbp, rsp, rdi, rsi, r12-r15 (xmm6, xmm15) + +// Unix x64: +// Calling: rdi, rsi, rdx, rcx, r8, r9 (float: xmm0-xmm7) +// Return: rax (float: xmm0) +// Volatile: +// Nonvolatile: rbx, rbp, rsp, r12-r15 + +// outcome: + +ENTRY_NP(gcm_mul_pclmulqdq) + // + // Copy Parameters + // + movdqu (%rcx), %xmm0 // P1 + movdqu (%rdx), %xmm1 // P2 + + // + // Byte swap 16-byte input + // + lea .Lbyte_swap16_mask(%rip), %rax + movups (%rax), %xmm10 + pshufb %xmm10, %xmm0 + pshufb %xmm10, %xmm1 + + + // + // Multiply with the hash key + // + movdqu %xmm0, %xmm3 + pclmulqdq $0, %xmm1, %xmm3 // xmm3 holds a0*b0 + + movdqu %xmm0, %xmm4 + pclmulqdq $16, %xmm1, %xmm4 // xmm4 holds a0*b1 + + movdqu %xmm0, %xmm5 + pclmulqdq $1, %xmm1, %xmm5 // xmm5 holds a1*b0 + movdqu %xmm0, %xmm6 + pclmulqdq $17, %xmm1, %xmm6 // xmm6 holds a1*b1 + + pxor %xmm5, %xmm4 // xmm4 holds a0*b1 + a1*b0 + + movdqu %xmm4, %xmm5 // move the contents of xmm4 to xmm5 + psrldq $8, %xmm4 // shift by xmm4 64 bits to the right + pslldq $8, %xmm5 // shift by xmm5 64 bits to the left + pxor %xmm5, %xmm3 + pxor %xmm4, %xmm6 // Register pair holds the result + // of the carry-less multiplication of + // xmm0 by xmm1. + + // We shift the result of the multiplication by one bit position + // to the left to cope for the fact that the bits are reversed. 
+ movdqu %xmm3, %xmm7 + movdqu %xmm6, %xmm8 + pslld $1, %xmm3 + pslld $1, %xmm6 + psrld $31, %xmm7 + psrld $31, %xmm8 + movdqu %xmm7, %xmm9 + pslldq $4, %xmm8 + pslldq $4, %xmm7 + psrldq $12, %xmm9 + por %xmm7, %xmm3 + por %xmm8, %xmm6 + por %xmm9, %xmm6 + + // + // First phase of the reduction + // + // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts + // independently. + movdqu %xmm3, %xmm7 + movdqu %xmm3, %xmm8 + movdqu %xmm3, %xmm9 + pslld $31, %xmm7 // packed right shift shifting << 31 + pslld $30, %xmm8 // packed right shift shifting << 30 + pslld $25, %xmm9 // packed right shift shifting << 25 + pxor %xmm8, %xmm7 // xor the shifted versions + pxor %xmm9, %xmm7 + movdqu %xmm7, %xmm8 + pslldq $12, %xmm7 + psrldq $4, %xmm8 + pxor %xmm7, %xmm3 // first phase of the reduction complete + + // + // Second phase of the reduction + // + // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these + // shift operations. + movdqu %xmm3, %xmm2 + movdqu %xmm3, %xmm4 // packed left shifting >> 1 + movdqu %xmm3, %xmm5 + psrld $1, %xmm2 + psrld $2, %xmm4 // packed left shifting >> 2 + psrld $7, %xmm5 // packed left shifting >> 7 + pxor %xmm4, %xmm2 // xor the shifted versions + pxor %xmm5, %xmm2 + pxor %xmm8, %xmm2 + pxor %xmm2, %xmm3 + pxor %xmm3, %xmm6 // the result is in xmm6 + + // + // Byte swap 16-byte result + // + pshufb %xmm10, %xmm6 // %xmm10 has the swap mask + + // + // Store the result + // + movdqu %xmm6, (%r8) // P3 + + + // + // Return + // + RET + SET_SIZE(gcm_mul_pclmulqdq) + +#endif /* lint || __lint */ + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif