diff --git a/simd-checksum-x86_64.cpp b/simd-checksum-x86_64.cpp index 33f26e92..438ce369 100644 --- a/simd-checksum-x86_64.cpp +++ b/simd-checksum-x86_64.cpp @@ -85,9 +85,7 @@ typedef long long __m256i_u __attribute__((__vector_size__(32), __may_alias__, _ #define SSE2_HADDS_EPI16(a, b) _mm_adds_epi16(SSE2_INTERLEAVE_EVEN_EPI16(a, b), SSE2_INTERLEAVE_ODD_EPI16(a, b)) #define SSE2_MADDUBS_EPI16(a, b) _mm_adds_epi16(SSE2_MULU_EVEN_EPI8(a, b), SSE2_MULU_ODD_EPI8(a, b)) -#ifndef USE_ROLL_ASM -__attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_avx2_64(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; } -#endif +__attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_avx2(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; } __attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_ssse3_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; } __attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; } @@ -316,6 +314,10 @@ __attribute__ ((target("sse2"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf #ifdef USE_ROLL_ASM /* { */ extern "C" __attribute__ ((target("avx2"))) int32 get_checksum1_avx2_asm(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2); +__attribute__ ((target("avx2"))) MVSTATIC int32 get_checksum1_avx2(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) +{ + return get_checksum1_avx2_asm(buf, len, i, ps1, ps2); +} #else /* } { */ @@ -432,6 +434,10 @@ __attribute__ ((target("avx2"))) MVSTATIC int32 get_checksum1_avx2_64(schar* buf } return i; } +__attribute__ ((target("avx2"))) MVSTATIC int32 get_checksum1_avx2(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) +{ + return get_checksum1_avx2_64(buf, len, i, ps1, ps2); +} #endif /* } !USE_ROLL_ASM */ @@ -461,11 +467,7 @@ static inline uint32 get_checksum1_cpp(char *buf1, int32 len) uint32 s2 = 0; // multiples of 64 bytes using AVX2 (if available) -#ifdef USE_ROLL_ASM - i = get_checksum1_avx2_asm((schar*)buf1, len, i, &s1, &s2); -#else - i = get_checksum1_avx2_64((schar*)buf1, len, i, &s1, &s2); -#endif + i = get_checksum1_avx2((schar*)buf1, len, i, &s1, &s2); // multiples of 32 bytes using SSSE3 (if available) i = get_checksum1_ssse3_32((schar*)buf1, len, i, &s1, &s2); @@ -534,11 +536,7 @@ int main() { benchmark("Raw-C", get_checksum1_default_1, (schar*)buf, BLOCK_LEN); benchmark("SSE2", get_checksum1_sse2_32, (schar*)buf, BLOCK_LEN); benchmark("SSSE3", get_checksum1_ssse3_32, (schar*)buf, BLOCK_LEN); -#ifdef USE_ROLL_ASM - benchmark("AVX2-ASM", get_checksum1_avx2_asm, (schar*)buf, BLOCK_LEN); -#else - benchmark("AVX2", get_checksum1_avx2_64, (schar*)buf, BLOCK_LEN); -#endif + benchmark("AVX2", get_checksum1_avx2, (schar*)buf, BLOCK_LEN); free(buf); return 0;