-
Notifications
You must be signed in to change notification settings - Fork 54
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AVX512 support, new PR to replace old branch from contributor. We need CI to run so moving it here. #72
base: main
Are you sure you want to change the base?
AVX512 support, new PR to replace old branch from contributor. We need CI to run so moving it here. #72
Changes from 20 commits
c6248e0
cf22bca
def3a68
375fa35
9e18b50
469ef71
2eb5578
837d5a1
2289c96
ee3e5da
005ed7c
39094d4
d4ffdc1
bf79936
1e24d06
907e721
5ab0046
ca43c51
28dde8b
a00a8e3
5138407
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
/** | ||
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
* SPDX-License-Identifier: Apache-2.0. | ||
*/ | ||
|
||
#include <aws/checksums/private/crc_priv.h> | ||
|
||
#include <aws/common/config.h> | ||
#include <nmmintrin.h> | ||
|
||
#if _WIN64 || __x86_64__ || __ppc64_ | ||
typedef uint64_t *slice_ptr_type; | ||
typedef uint64_t slice_ptr_int_type; | ||
# define crc_intrin_fn _mm_crc32_u64 | ||
#else | ||
typedef uint32_t *slice_ptr_type; | ||
typedef uint32_t slice_ptr_int_type; | ||
# define crc_intrin_fn _mm_crc32_u32 | ||
#endif | ||
|
||
#ifdef AWS_HAVE_AVX512_INTRINSICS | ||
uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t crc); | ||
#endif | ||
|
||
uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t crc); |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
/** | ||
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
* SPDX-License-Identifier: Apache-2.0. | ||
*/ | ||
#include <aws/checksums/private/intel/crc32c_compiler_shims.h> | ||
#include <aws/common/cpuid.h> | ||
|
||
static bool detection_performed = false; | ||
static bool detected_sse42 = false; | ||
static bool detected_avx512 = false; | ||
static bool detected_clmul = false; | ||
static bool detected_vpclmulqdq = false; | ||
|
||
/* | ||
* Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) and | ||
* PCLMULQDQ machine instructions (if present). | ||
* Handles data that isn't 8-byte aligned as well as any trailing data with the CRC32B (byte) instruction. | ||
* Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent | ||
* call. | ||
*/ | ||
uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) { | ||
|
||
if (AWS_UNLIKELY(!detection_performed)) { | ||
detected_sse42 = aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2); | ||
detected_avx512 = aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512); | ||
detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL); | ||
detected_vpclmulqdq = aws_cpu_has_feature(AWS_CPU_FEATURE_VPCLMULQDQ); | ||
|
||
/* Simply setting the flag true to skip HW detection next time | ||
Not using memory barriers since the worst that can | ||
happen is a fallback to the non HW accelerated code. */ | ||
detection_performed = true; | ||
} | ||
|
||
/* this is the entry point. We should only do the bit flip once. It should not be done for the subfunctions and | ||
* branches.*/ | ||
uint32_t crc = ~previousCrc32; | ||
|
||
/* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */ | ||
if (length < (int)sizeof(slice_ptr_int_type)) { | ||
while (length-- > 0) { | ||
crc = (uint32_t)_mm_crc32_u8(crc, *input++); | ||
} | ||
return ~crc; | ||
} | ||
|
||
/* Get the 8-byte memory alignment of our input buffer by looking at the least significant 3 bits */ | ||
int input_alignment = (uintptr_t)(input)&0x7; | ||
|
||
/* Compute the number of unaligned bytes before the first aligned 8-byte chunk (will be in the range 0-7) */ | ||
int leading = (8 - input_alignment) & 0x7; | ||
|
||
/* reduce the length by the leading unaligned bytes we are about to process */ | ||
length -= leading; | ||
|
||
/* spin through the leading unaligned input bytes (if any) one-by-one */ | ||
while (leading-- > 0) { | ||
crc = (uint32_t)_mm_crc32_u8(crc, *input++); | ||
} | ||
|
||
#if defined(AWS_HAVE_AVX512_INTRINSICS) && (INTPTR_MAX == INT64_MAX) | ||
int chunk_size = length & ~63; | ||
|
||
if (detected_avx512 && detected_vpclmulqdq && detected_clmul) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd say detect_clmul is not needed, as it will always be true if detected_vpclmulqdq is true |
||
if (length >= 256) { | ||
crc = aws_checksums_crc32c_avx512(input, length, crc); | ||
/* check remaining data */ | ||
length -= chunk_size; | ||
if (!length) { | ||
return ~crc; | ||
} | ||
|
||
/* Fall into the default crc32 for the remaining data. */ | ||
input += chunk_size; | ||
} | ||
} | ||
#endif | ||
|
||
if (detected_sse42 && detected_clmul) { | ||
return aws_checksums_crc32c_sse42(input, length, crc); | ||
} | ||
Comment on lines
+79
to
+81
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will the sse42 impl be worth invoking after we've finished processing the lead portion using the avx512 implementation? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thank you for the comments. I will review them closely. For full disclosure - I re-used the avx512 intrinsic implementation from https://chromium.googlesource.com/chromium/src/third_party/zlib/+/b890619bc2b193b8fbe9c1c053f4cd19a9791d92/crc32_simd.c but recomputed constants for crc32c polynomial :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @pbadari I've got the build surgery done and tests passing if you'd like to work on the avx512 comments from bdonlan@ There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you'll need the AVX512 branch from aws-c-common until we merge it There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. I am reviewing the avx512 comments from bdonlan There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Jonathan, Most of the review comments (from Donlan) for the AVX512 code is further performance improvements which require careful re-write/proto-type and performance analysis. I reached out to our area expert for his input. I am wondering if we can merge the current patch and update it further when we have new code ready? Please let me know. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unfortunately, AVX512 has a history of being an incredibly risky addition to an already functioning (possibly already running really hot) system. Various chipset versions introduce timing issues for side-channel attacks as well as side-effects to other processes sharing the CPU. So we're going to have to run a lot of tests before we can just run this in production anyways, and we'd like the code to be close to structured in the actual final algorithm before running that analysis. This particular code runs for a lot of S3 PUT and GET operations across multiple SDKs, so any side-effects would most likely be felt across a large blast radius. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We are submitting AVX-512 based implementation of crc32 to zlib-chromium as well and want to make sure that both code contributions are licensed appropriately. For now, can we withdraw the patch/submission. I will resolve the issue and re-submit. |
||
|
||
/* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */ | ||
while (length >= (int)sizeof(slice_ptr_int_type)) { | ||
crc = (uint32_t)crc_intrin_fn(crc, *input); | ||
input += sizeof(slice_ptr_int_type); | ||
length -= (int)sizeof(slice_ptr_int_type); | ||
} | ||
|
||
/* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */ | ||
while (length-- > 0) { | ||
crc = (uint32_t)_mm_crc32_u8(crc, *input); | ||
input++; | ||
} | ||
|
||
return ~crc; | ||
} | ||
|
||
uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) { | ||
return aws_checksums_crc32_sw(input, length, previousCrc32); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are 16/32/64-bit variants of the CRC32 op that we should probably take advantage of.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This will use the 64 bit version on x64 and 32bit on x86. That’s what those typedefs and defines in the private header are for.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nevermind, I was at the wrong place in the file. This branch is for tiny inputs, and it doesn’t seem worth optimizing to me.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would be interested in performance for small inputs as well - but I'd agree that's more likely to be dwarfed by surrounding code for sure.