Skip to content

Commit

Permalink
Add Windows assembler: gcm_pclmulqdq
Browse files Browse the repository at this point in the history
  • Loading branch information
lundman committed Oct 21, 2022
1 parent 7f944e0 commit f4dc7f9
Show file tree
Hide file tree
Showing 5 changed files with 277 additions and 3 deletions.
6 changes: 4 additions & 2 deletions include/os/windows/spl/sys/simd.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,10 @@ xgetbv(uint32_t c)

#endif

#define CPUID_FEATURE_AES (1<<25)
#define CPUID_FEATURE_XSAVE (1<<26)
#define CPUID_FEATURE_PCLMULQDQ (1<<1)
#define CPUID_FEATURE_AES (1<<25)
#define CPUID_FEATURE_XSAVE (1<<26)
//#define CPUID_FEATURE_AVX (1<<28)

extern uint64_t spl_cpuid_features(void);
extern uint64_t spl_cpuid_leaf7_features(void);
Expand Down
3 changes: 3 additions & 0 deletions include/os/windows/zfs/zfs_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@

#define HAVE_USLEEP 1

/* These control which assembler files to use */
//#define HAVE_AVX 1
#define HAVE_PCLMULQDQ 1
#define HAVE_AES 1

/* Path where the kernel module is installed. */
Expand Down
1 change: 1 addition & 0 deletions lib/libicp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ add_library(libicp
"${ICP_MODULE_DIR}/asm-x86_64/os/windows/aes/aes_amd64.S"
"${ICP_MODULE_DIR}/asm-x86_64/os/windows/sha2/sha256_impl.S"
"${ICP_MODULE_DIR}/asm-x86_64/os/windows/sha2/sha512_impl.S"
"${ICP_MODULE_DIR}/asm-x86_64/os/windows/modes/gcm_pclmulqdq.S"
)

# Add windows/assembler sources here too.
Expand Down
3 changes: 2 additions & 1 deletion module/icp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ wdk_add_library(icpkern
algs/modes/ecb.c
algs/modes/gcm.c
algs/modes/gcm_generic.c
algs/modes/gcm_pclmulqdq.c
algs/modes/modes.c
algs/sha2/sha2.c
algs/skein/skein.c
Expand All @@ -33,7 +34,7 @@ wdk_add_library(icpkern
asm-x86_64/os/windows/aes/aes_aesni.S
asm-x86_64/os/windows/aes/aes_amd64.S
asm-x86_64/modes/aesni-gcm-x86_64.S
asm-x86_64/modes/gcm_pclmulqdq.S
asm-x86_64/os/windows/modes/gcm_pclmulqdq.S
asm-x86_64/modes/ghash-x86_64.S
# asm-x86_64/sha1/sha1-x86_64.S
asm-x86_64/os/windows/sha2/sha256_impl.S
Expand Down
267 changes: 267 additions & 0 deletions module/icp/asm-x86_64/os/windows/modes/gcm_pclmulqdq.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,267 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/

/*
* Copyright (c) 2009 Intel Corporation
* All Rights Reserved.
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/

/*
* Accelerated GHASH implementation with Intel PCLMULQDQ-NI
* instructions. This file contains an accelerated
* Galois Field Multiplication implementation.
*
* PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
* carry-less multiplication. More information about PCLMULQDQ can be
* found at:
* http://software.intel.com/en-us/articles/
* carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
*
*/

/*
* ====================================================================
* OpenSolaris OS modifications
*
* This source originates as file galois_hash_asm.c from
* Intel Corporation dated September 21, 2009.
*
* This OpenSolaris version has these major changes from the original source:
*
* 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
* /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function
* definition for lint.
*
* 2. Formatted code, added comments, and added #includes and #defines.
*
* 3. If bit CR0.TS is set, clear and set the TS bit, after and before
* calling kpreempt_disable() and kpreempt_enable().
* If the TS bit is not set, Save and restore %xmm registers at the beginning
* and end of function calls (%xmm* registers are not saved and restored
* during kernel thread preemption).
*
* 4. Removed code to perform hashing. This is already done with C macro
* GHASH in gcm.c. For better performance, this removed code should be
* reintegrated in the future to replace the C GHASH macro.
*
* 5. Added code to byte swap 16-byte input and output.
*
* 6. Folded in comments from the original C source with embedded assembly
* (SB_w_shift_xor.c)
*
* 7. Renamed function and reordered parameters to match OpenSolaris:
* Intel interface:
* void galois_hash_asm(unsigned char *hk, unsigned char *s,
* unsigned char *d, int length)
* OpenSolaris OS interface:
* void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
* ====================================================================
*/


#if defined(lint) || defined(__lint) /* lint */

#include <sys/types.h>

/*
 * Lint-only placeholder for the assembly routine. It performs no work;
 * each argument is cast to void so lint does not report it as unused.
 */
void
gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res)
{
	(void) x_in;
	(void) y;
	(void) res;
}

#elif defined(HAVE_PCLMULQDQ) /* guard by instruction set */

#define _ASM
#include <sys/asm_linkage.h>

/*
* Use this mask to byte-swap a 16-byte integer with the pshufb instruction
*/

// C equivalent of the table emitted below:
// static uint8_t byte_swap16_mask[] = {
// 15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 };
//
// Entry i holds 15 - i, so when the table is used as the pshufb control
// operand, destination byte i receives source byte 15 - i, i.e. the byte
// order of the 16-byte value is reversed.
//
// The label uses the .L prefix, so it is local to this object file.
// NOTE(review): XMM_ALIGN is not defined in this file; presumably it comes
// from <sys/asm_linkage.h> and evaluates to 16 -- confirm.
.section .rodata
.align XMM_ALIGN
.Lbyte_swap16_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0


/*
* void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
*
* Perform a carry-less multiplication (that is, use XOR instead of the
* multiply operator) on P1 and P2 and place the result in P3.
*
* Byte swap the input and the output.
*
* Note: x_in, y, and res all point to a block of 16-byte numbers
* (an array of two 64-bit integers).
*
* Note2: For kernel code, the caller is responsible for ensuring
* kpreempt_disable() has been called. This is because %xmm registers are
* not saved/restored during kernel thread preemption. The original
* OpenSolaris code cleared and set the CR0.TS bit on entry and exit,
* respectively, if TS was set on entry and otherwise saved and restored
* the %xmm registers on the stack; the routine below does neither.
*
* Note3: Original Intel definition:
* void galois_hash_asm(unsigned char *hk, unsigned char *s,
* unsigned char *d, int length)
*
* Note4: Register/parameter mapping:
* Intel:
* Parameter 1: %rcx (copied to %xmm0) hk or x_in
* Parameter 2: %rdx (copied to %xmm1) s or y
* Parameter 3: %rdi (result) d or res
* OpenSolaris:
* Parameter 1: %rdi (copied to %xmm0) x_in
* Parameter 2: %rsi (copied to %xmm1) y
* Parameter 3: %rdx (result) res
*/
// Calling-convention summary.
//
// Windows x64 (the ABI this file is written for):
//   Calling:     rcx, rdx, r8, and r9 (float: xmm0-xmm3)
//   Return:      rax (float: xmm0)
//   Volatile:    rax, rcx, rdx, r8-r11, xmm0-xmm5
//   Nonvolatile: rbx, rbp, rsp, rdi, rsi, r12-r15, xmm6-xmm15
//
// Unix x64 (System V, listed for comparison):
//   Calling:     rdi, rsi, rdx, rcx, r8, r9 (float: xmm0-xmm7)
//   Return:      rax (float: xmm0)
//   Volatile:    all registers not listed as nonvolatile, incl. xmm0-xmm15
//   Nonvolatile: rbx, rbp, rsp, r12-r15
//
// void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
//
//   In:       %rcx = x_in, %rdx = y, %r8 = res (16 bytes each; accessed
//             with unaligned loads/stores, so no alignment is required)
//   Out:      16-byte product stored at (%r8); no return value
//   Clobbers: %rax, %xmm0-%xmm5
//   Stack:    %rsp is never modified and no stack memory is touched
//
// xmm6-xmm15 are callee-saved on Windows x64, so this routine is written
// to use only the volatile registers xmm0-xmm5. The byte-swap mask is
// therefore not kept in a register for the whole routine; it is reloaded
// through %rax just before the final swap.
//
// Register roles once the four partial products have been formed:
//   %xmm3          low  128 bits of the product / reduction accumulator
//   %xmm0          high 128 bits of the product, finally the result
//   %xmm1,2,4,5    scratch
//   %rax           address of .Lbyte_swap16_mask

ENTRY_NP(gcm_mul_pclmulqdq)
	//
	// Copy Parameters
	//
	movdqu	(%rcx), %xmm0		// P1 (x_in), called a below
	movdqu	(%rdx), %xmm1		// P2 (y), called b below

	//
	// Byte swap 16-byte input
	//
	lea	.Lbyte_swap16_mask(%rip), %rax	// kept live for the final swap
	movups	(%rax), %xmm2		// xmm2 = swap mask (scratch afterwards)
	pshufb	%xmm2, %xmm0
	pshufb	%xmm2, %xmm1

	//
	// Multiply with the hash key
	//
	movdqu	%xmm0, %xmm3
	pclmulqdq $0, %xmm1, %xmm3	// xmm3 holds a0*b0

	movdqu	%xmm0, %xmm4
	pclmulqdq $16, %xmm1, %xmm4	// xmm4 holds a0*b1

	movdqu	%xmm0, %xmm5
	pclmulqdq $1, %xmm1, %xmm5	// xmm5 holds a1*b0

	// a is not needed after this multiply, so it is done in place.
	pclmulqdq $17, %xmm1, %xmm0	// xmm0 holds a1*b1

	pxor	%xmm5, %xmm4		// xmm4 holds a0*b1 + a1*b0

	movdqu	%xmm4, %xmm5		// copy the middle term
	psrldq	$8, %xmm4		// xmm4 = middle term >> 64 bits
	pslldq	$8, %xmm5		// xmm5 = middle term << 64 bits
	pxor	%xmm5, %xmm3
	pxor	%xmm4, %xmm0		// Register pair <xmm0:xmm3> holds the
					// result of the carry-less
					// multiplication of a by b.

	// We shift the result of the multiplication by one bit position
	// to the left to cope for the fact that the bits are reversed.
	// The per-dword shift drops the top bit of every dword; those bits
	// are collected separately and moved up into the next dword.
	movdqu	%xmm3, %xmm1
	movdqu	%xmm0, %xmm2
	pslld	$1, %xmm3
	pslld	$1, %xmm0
	psrld	$31, %xmm1		// top bit of each dword of the low half
	psrld	$31, %xmm2		// top bit of each dword of the high half
	movdqu	%xmm1, %xmm4
	pslldq	$4, %xmm2		// carries move up one dword (high half)
	pslldq	$4, %xmm1		// carries move up one dword (low half)
	psrldq	$12, %xmm4		// carry out of the low half into the high
	por	%xmm1, %xmm3
	por	%xmm2, %xmm0
	por	%xmm4, %xmm0

	//
	// First phase of the reduction
	//
	// Make three copies of xmm3 in xmm1, xmm2, xmm4 in order to perform
	// the shifts independently.
	movdqu	%xmm3, %xmm1
	movdqu	%xmm3, %xmm2
	movdqu	%xmm3, %xmm4
	pslld	$31, %xmm1		// packed left shift of each dword by 31
	pslld	$30, %xmm2		// packed left shift of each dword by 30
	pslld	$25, %xmm4		// packed left shift of each dword by 25
	pxor	%xmm2, %xmm1		// xor the shifted versions
	pxor	%xmm4, %xmm1
	movdqu	%xmm1, %xmm2
	pslldq	$12, %xmm1
	psrldq	$4, %xmm2		// xmm2 is consumed by the second phase
	pxor	%xmm1, %xmm3		// first phase of the reduction complete

	//
	// Second phase of the reduction
	//
	// Make three copies of xmm3 in xmm1, xmm4, xmm5 for doing these
	// shift operations.
	movdqu	%xmm3, %xmm1
	movdqu	%xmm3, %xmm4
	movdqu	%xmm3, %xmm5
	psrld	$1, %xmm1		// packed right shift of each dword by 1
	psrld	$2, %xmm4		// packed right shift of each dword by 2
	psrld	$7, %xmm5		// packed right shift of each dword by 7
	pxor	%xmm4, %xmm1		// xor the shifted versions
	pxor	%xmm5, %xmm1
	pxor	%xmm2, %xmm1		// fold in the term saved by phase one
	pxor	%xmm1, %xmm3
	pxor	%xmm3, %xmm0		// the result is in xmm0

	//
	// Byte swap 16-byte result
	//
	movups	(%rax), %xmm1		// reload the swap mask
	pshufb	%xmm1, %xmm0

	//
	// Store the result
	//
	movdqu	%xmm0, (%r8)		// P3 (res)

	//
	// Return
	//
	RET
	SET_SIZE(gcm_mul_pclmulqdq)

#endif /* lint || __lint */

#ifdef __ELF__
.section .note.GNU-stack,"",%progbits
#endif

0 comments on commit f4dc7f9

Please sign in to comment.