Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AVX512 support, new PR to replace old branch from contributor. We need CI to run so moving it here. #72

Draft
wants to merge 21 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
c6248e0
Added CRC32C AVX512 support.
javazque Jun 22, 2023
cf22bca
Fixed routine name to indicate crc32c
pbadari Jun 27, 2023
def3a68
Merge pull request #1 from pbadari/avx512_support
pbadari Jun 27, 2023
375fa35
Add sse42 avx512_intrinsics support
javazque Jul 14, 2023
9e18b50
Merge pull request #2 from pbadari/sse42_avx512_intrinsics
pbadari Jul 14, 2023
469ef71
Merge branch 'main' into main
JonathanHenson Jul 14, 2023
2eb5578
Refactoring work for the AVX512 code path. Testing shows it not quite…
JonathanHenson Jul 19, 2023
837d5a1
Keep the naive avx512 path on for figuring out codebuild capabilities…
JonathanHenson Jul 19, 2023
2289c96
Fix build and do correct cpu feature detection.
JonathanHenson Jul 19, 2023
ee3e5da
fix 32-bit builds and builds that need to work without intrinsics ava…
JonathanHenson Jul 19, 2023
005ed7c
Not sure how the avx512 code got called. hopefully codebuild is just…
JonathanHenson Jul 19, 2023
39094d4
Found why the wrong build files were being used at least.
JonathanHenson Jul 19, 2023
d4ffdc1
Make test pass when it passes.
JonathanHenson Jul 19, 2023
bf79936
Try it again.
JonathanHenson Jul 19, 2023
1e24d06
fix leftover symbol collision.
JonathanHenson Jul 19, 2023
907e721
Added more compile gates and assertions.
JonathanHenson Jul 19, 2023
5ab0046
Fix osx build.
JonathanHenson Jul 20, 2023
ca43c51
make the bitflips uniform.
JonathanHenson Jul 20, 2023
28dde8b
add additional runtime cpuid check and run formatter.
JonathanHenson Jul 20, 2023
a00a8e3
work around nasty bitflipping logic.
JonathanHenson Jul 20, 2023
5138407
Addressed review comments, use ternary logic instructions and optimiz…
pbadari Jan 1, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 45 additions & 8 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ string(REPLACE ";" "${AWS_MODULE_DIR};" AWS_MODULE_PATH "${CMAKE_PREFIX_PATH}${A
# Append that generated list to the module search path
list(APPEND CMAKE_MODULE_PATH ${AWS_MODULE_PATH})

include(AwsSIMD)
include(AwsCFlags)
include(AwsCheckHeaders)
include(AwsSharedLibSetup)
Expand Down Expand Up @@ -58,17 +59,48 @@ file(GLOB AWS_ARCH_SRC
)

if (USE_CPU_EXTENSIONS)
if(AWS_ARCH_INTEL)
# First, check if inline assembly is available. Inline assembly can also be supported by MSVC if the compiler in use is Clang.
if(AWS_HAVE_GCC_INLINE_ASM)
file(GLOB AWS_ARCH_SRC
"source/intel/asm/*.c"
if (AWS_ARCH_INTEL)
file (GLOB AWS_ARCH_INTEL_SRC
"source/intel/*.c"
)

if (AWS_HAVE_AVX512_INTRINSICS AND CMAKE_SIZEOF_VOID_P EQUAL 8)
if (MSVC)
file(GLOB AWS_ARCH_INTRIN_SRC
"source/intel/intrin/*.c"
"source/intel/visualc/*.c"
)
elseif (MSVC)
file(GLOB AWS_ARCH_SRC
else()
file(GLOB AWS_ARCH_INTRIN_SRC
"source/intel/intrin/*.c"
)
endif()
else()
if (MSVC)
file(GLOB AWS_ARCH_INTRIN_SRC
"source/intel/visualc/*.c"
)
endif()
endif()

source_group("Source Files\\intel" FILES ${AWS_ARCH_INTEL_SRC})
source_group("Source Files\\intel\\intrin" FILES ${AWS_ARCH_INTRIN_SRC})

if (AWS_HAVE_GCC_INLINE_ASM)
file(GLOB AWS_ARCH_ASM_SRC
"source/intel/asm/*.c"
)

file(GLOB AWS_ARCH_SRC
${AWS_ARCH_INTEL_SRC}
${AWS_ARCH_INTRIN_SRC}
${AWS_ARCH_ASM_SRC}
)
else()
file(GLOB AWS_ARCH_SRC
${AWS_ARCH_INTEL_SRC}
${AWS_ARCH_INTRIN_SRC}
)
source_group("Source Files\\intel\\visualc" FILES ${AWS_ARCH_SRC})
endif()
endif()

Expand Down Expand Up @@ -115,6 +147,7 @@ file(GLOB CHECKSUMS_COMBINED_SRC


add_library(${PROJECT_NAME} ${CHECKSUMS_COMBINED_HEADERS} ${CHECKSUMS_COMBINED_SRC})

aws_set_common_properties(${PROJECT_NAME})
aws_prepare_symbol_visibility_args(${PROJECT_NAME} "AWS_CHECKSUMS")
aws_check_headers(${PROJECT_NAME} ${AWS_CHECKSUMS_HEADERS})
Expand All @@ -124,6 +157,10 @@ aws_add_sanitizers(${PROJECT_NAME})
# We are not ABI stable yet
set_target_properties(${PROJECT_NAME} PROPERTIES VERSION 1.0.0)

# Hand the Intel-specific sources to the AwsSIMD helper when CPU extensions are enabled on an Intel target.
# NOTE(review): simd_add_source_avx comes from the AwsSIMD module included above and is not visible here;
# presumably it attaches AVX compiler flags to every file it is given. AWS_ARCH_SRC also carries the
# dispatcher (source/intel/*.c) and the SSE4.2 asm sources, so confirm that building those with the same
# flags is intended and cannot emit AVX instructions ahead of the runtime CPU checks.
if (USE_CPU_EXTENSIONS AND AWS_ARCH_INTEL)
simd_add_source_avx(${PROJECT_NAME} ${AWS_ARCH_SRC})
endif()

target_include_directories(${PROJECT_NAME} PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include>)
Expand Down
25 changes: 25 additions & 0 deletions include/aws/checksums/private/intel/crc32c_compiler_shims.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/**
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
* SPDX-License-Identifier: Apache-2.0.
*/

#include <aws/checksums/private/crc_priv.h>

#include <aws/common/config.h>
#include <nmmintrin.h>

/*
 * Select the machine-word flavour of the CRC32 instruction:
 *   - 64-bit targets consume 8 bytes per step through _mm_crc32_u64,
 *   - everything else consumes 4 bytes per step through _mm_crc32_u32.
 * slice_ptr_int_type is the word that is read, slice_ptr_type a pointer to it, and crc_intrin_fn the matching
 * intrinsic. The macros are tested with defined() so that an absent macro is never evaluated as an expression.
 */
#if defined(_WIN64) || defined(__x86_64__) || defined(__ppc64__)
typedef uint64_t *slice_ptr_type;
typedef uint64_t slice_ptr_int_type;
#    define crc_intrin_fn _mm_crc32_u64
#else
typedef uint32_t *slice_ptr_type;
typedef uint32_t slice_ptr_int_type;
#    define crc_intrin_fn _mm_crc32_u32
#endif

/*
 * AVX512 CRC32c kernel; only declared when the build detected AVX512 intrinsics support.
 * The dispatcher (aws_checksums_crc32c_hw) passes the already bit-inverted running CRC in `crc` and inverts the
 * returned value itself, i.e. this function takes and returns the raw CRC state.
 * NOTE(review): the dispatcher passes the full remaining length and then advances by length & ~63, so this
 * presumably consumes only whole 64-byte blocks - confirm against the implementation.
 */
#ifdef AWS_HAVE_AVX512_INTRINSICS
uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t crc);
#endif

/*
 * SSE4.2 CRC32c kernel. `crc` is the already bit-inverted running CRC handed over by the dispatcher; the return
 * value is the final CRC with the inversion applied again, so the dispatcher returns it unchanged.
 */
uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t crc);
22 changes: 9 additions & 13 deletions source/intel/asm/crc32c_sse42_asm.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* SPDX-License-Identifier: Apache-2.0.
*/

#include <aws/checksums/private/crc_priv.h>
#include <aws/checksums/private/intel/crc32c_compiler_shims.h>

#include <aws/common/cpuid.h>

Expand Down Expand Up @@ -283,7 +283,7 @@ static bool detected_clmul = false;
* Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent
* call.
*/
uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t previousCrc32) {

if (AWS_UNLIKELY(!detection_performed)) {
detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL);
Expand All @@ -293,7 +293,8 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev
detection_performed = true;
}

uint32_t crc = ~previousCrc32;
/* this is called by a higher-level shim and previousCRC32 is already ~ */
uint32_t crc = previousCrc32;

/* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */
if (AWS_UNLIKELY(length < 8)) {
Expand Down Expand Up @@ -358,22 +359,17 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev

return ~crc;
}
uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
return aws_checksums_crc32_sw(input, length, previousCrc32);
}

# if defined(__clang__)
# pragma clang diagnostic pop
# endif

#else
uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
return aws_checksums_crc32_sw(input, length, previousCrc32);
}

uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
return aws_checksums_crc32c_sw(input, length, previousCrc32);
uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t previousCrc32) {
    /* Fallback used when the inline-assembly implementation is not compiled in.
     * The caller hands over a CRC that has already been bit flipped, because this function is one step of a
     * larger computation. The software implementation is also a standalone entry point and performs the flip
     * itself, so undo the caller's flip first and let the software routine apply it again. */
    const uint32_t unflipped_crc = ~previousCrc32;
    return aws_checksums_crc32c_sw(input, length, unflipped_crc);
}

#endif
/* clang-format on */
101 changes: 101 additions & 0 deletions source/intel/crc_hw.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/**
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
* SPDX-License-Identifier: Apache-2.0.
*/
#include <aws/checksums/private/intel/crc32c_compiler_shims.h>
#include <aws/common/cpuid.h>

/* Lazily initialised CPU-feature flags; filled in by aws_checksums_crc32c_hw on its first call. */
/* Set to true once the feature probes below have been run, so later calls skip detection. */
static bool detection_performed = false;
/* Result of aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2). */
static bool detected_sse42 = false;
/* Result of aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512). */
static bool detected_avx512 = false;
/* Result of aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL). */
static bool detected_clmul = false;
/* Result of aws_cpu_has_feature(AWS_CPU_FEATURE_VPCLMULQDQ). */
static bool detected_vpclmulqdq = false;

/*
 * Entry point for the hardware-accelerated Castagnoli CRC32c (iSCSI) of a data buffer.
 *
 * input         - bytes to checksum.
 * length        - number of bytes in input.
 * previousCrc32 - 0 for a fresh checksum, or the value returned by an earlier call to continue a running CRC.
 *
 * Returns the updated CRC32c.
 *
 * The bit inversion of the CRC state is applied exactly once, here: on entry and on every return. The AVX512
 * helper receives and returns the raw (inverted) state; the SSE4.2 helper receives the raw state and returns the
 * final, re-inverted value.
 *
 * Dispatch order: AVX512 + VPCLMULQDQ for inputs of at least 256 bytes (when compiled in and detected), then
 * SSE4.2 + PCLMULQDQ, and finally a plain loop over the CRC32 word and byte instructions.
 *
 * NOTE(review): the CRC32 intrinsics below run regardless of detected_sse42, so this presumably relies on the
 * caller only routing here on CPUs that have SSE4.2 - confirm against the dispatcher.
 */
uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {

    if (AWS_UNLIKELY(!detection_performed)) {
        detected_sse42 = aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2);
        detected_avx512 = aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512);
        detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL);
        detected_vpclmulqdq = aws_cpu_has_feature(AWS_CPU_FEATURE_VPCLMULQDQ);

        /* Simply setting the flag true to skip HW detection next time
           Not using memory barriers since the worst that can
           happen is a fallback to the non HW accelerated code. */
        detection_performed = true;
    }

    /* this is the entry point. We should only do the bit flip once. It should not be done for the subfunctions and
     * branches.*/
    uint32_t crc = ~previousCrc32;

    /* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */
    if (length < (int)sizeof(slice_ptr_int_type)) {
        while (length-- > 0) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are 16/32/64-bit variants of the CRC32 op that we should probably take advantage of.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will use the 64 bit version on x64 and 32bit on x86. That’s what those typedefs and defines in the private header are for.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will use the 64 bit version on x64 and 32bit on x86. That’s what those typedefs and defines in the private header are for.

Nevermind, I was at the wrong place in the file. This branch is for tiny inputs, and it doesn’t seem worth optimizing to me.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would be interested in performance for small inputs as well - but I'd agree that's more likely to be dwarfed by surrounding code for sure.

            crc = (uint32_t)_mm_crc32_u8(crc, *input++);
        }
        return ~crc;
    }

    /* Get the 8-byte memory alignment of our input buffer by looking at the least significant 3 bits */
    int input_alignment = (uintptr_t)(input)&0x7;

    /* Compute the number of unaligned bytes before the first aligned 8-byte chunk (will be in the range 0-7) */
    int leading = (8 - input_alignment) & 0x7;

    /* On 32-bit builds the small-input cutoff above is 4 bytes, so up to 7 leading bytes could exceed the buffer.
     * Never consume more than the caller supplied. */
    if (leading > length) {
        leading = length;
    }

    /* reduce the length by the leading unaligned bytes we are about to process */
    length -= leading;

    /* spin through the leading unaligned input bytes (if any) one-by-one */
    while (leading-- > 0) {
        crc = (uint32_t)_mm_crc32_u8(crc, *input++);
    }

#if defined(AWS_HAVE_AVX512_INTRINSICS) && (INTPTR_MAX == INT64_MAX)
    /* Largest multiple of 64 bytes in the remaining input; the pointer is advanced by this much after the AVX512
     * call. */
    int chunk_size = length & ~63;

    if (detected_avx512 && detected_vpclmulqdq && detected_clmul) {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd say detect_clmul is not needed, as it will always be true if detected_vpclmulqdq is true

        if (length >= 256) {
            crc = aws_checksums_crc32c_avx512(input, length, crc);
            /* check remaining data */
            length -= chunk_size;
            if (!length) {
                return ~crc;
            }

            /* Fall into the default crc32 for the remaining data. */
            input += chunk_size;
        }
    }
#endif

    if (detected_sse42 && detected_clmul) {
        return aws_checksums_crc32c_sse42(input, length, crc);
    }
Comment on lines +79 to +81
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will the sse42 impl be worth invoking after we've finished processing the lead portion using the avx512 implementation?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for the comments. I will review them closely.

For full disclosure - I re-used the avx512 intrinsic implementation from https://chromium.googlesource.com/chromium/src/third_party/zlib/+/b890619bc2b193b8fbe9c1c053f4cd19a9791d92/crc32_simd.c

but recomputed constants for crc32c polynomial :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@pbadari I've got the build surgery done and tests passing if you'd like to work on the avx512 comments from bdonlan@

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you'll need the AVX512 branch from aws-c-common until we merge it

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. I am reviewing the avx512 comments from bdonlan

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Jonathan, Most of the review comments (from Donlan) for the AVX512 code is further performance improvements which require careful re-write/proto-type and performance analysis. I reached out to our area expert for his input. I am wondering if we can merge the current patch and update it further when we have new code ready? Please let me know.

Copy link
Contributor Author

@JonathanHenson JonathanHenson Jul 24, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately, AVX512 has a history of being an incredibly risky addition to an already functioning (possibly already running really hot) system. Various chipset versions introduce timing issues for side-channel attacks as well as side-effects to other processes sharing the CPU. So we're going to have to run a lot of tests before we can just run this in production anyways, and we'd like the code to be close to structured in the actual final algorithm before running that analysis.

This particular code runs for a lot of S3 PUT and GET operations across multiple SDKs, so any side-effects would most likely be felt across a large blast radius.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are submitting AVX-512 based implementation of crc32 to zlib-chromium as well and want to make sure that both code contributions are licensed appropriately. For now, can we withdraw the patch/submission. I will resolve the issue and re-submit.


    /* Spin through remaining (aligned) machine words using the word-sized CRC32 instruction.
     * The whole word must be loaded: dereferencing the byte pointer directly would feed a single byte to the
     * instruction while the pointer still advances by a full word. */
    while (length >= (int)sizeof(slice_ptr_int_type)) {
        crc = (uint32_t)crc_intrin_fn(crc, *(const slice_ptr_int_type *)input);
        input += sizeof(slice_ptr_int_type);
        length -= (int)sizeof(slice_ptr_int_type);
    }

    /* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */
    while (length-- > 0) {
        crc = (uint32_t)_mm_crc32_u8(crc, *input);
        input++;
    }

    return ~crc;
}

/* No accelerated path is implemented here for aws_checksums_crc32_hw: the arguments are forwarded unchanged to
 * aws_checksums_crc32_sw and its result is returned as-is. */
uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
    const uint32_t sw_result = aws_checksums_crc32_sw(input, length, previousCrc32);
    return sw_result;
}
Loading