From 0a17287425a527d02aa4698b3c4ba4e102b73f5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcus=20M=C3=BCller?= Date: Mon, 23 Oct 2023 18:41:58 +0200 Subject: [PATCH] granular byte-parallel generic kernel for byteswap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marcus Müller --- kernels/volk/volk_64u_byteswap.h | 52 +++++++++++----------- kernels/volk/volk_64u_byteswappuppet_64u.h | 11 +++++ 2 files changed, 37 insertions(+), 26 deletions(-) diff --git a/kernels/volk/volk_64u_byteswap.h b/kernels/volk/volk_64u_byteswap.h index 22bccab2d..919faf662 100644 --- a/kernels/volk/volk_64u_byteswap.h +++ b/kernels/volk/volk_64u_byteswap.h @@ -56,6 +56,30 @@ #include #include +#ifdef LV_HAVE_GENERIC +/* Adapted from https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel + * Where they reverse the bits in an N-bit word. But who's stopping me from doing the same + * on byte level? + * Idea is simple: swap the elementary units with half of them "selected" each step, in a + * Hadamard kind of selection. 
+ */ + +static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, + unsigned int num_points) +{ + for (unsigned int point = 0; point < num_points; point++, intsToSwap++) { + uint64_t in = *intsToSwap; + /* step 1: exchange the two bytes inside every 16-bit unit */ + in = (in & 0x00FF00FF00FF00FF) << 8 | (in & 0xFF00FF00FF00FF00) >> 8; + /* step 2: exchange the two 16-bit units inside every 32-bit word */ + in = (in & 0x0000FFFF0000FFFF) << 16 | (in & 0xFFFF0000FFFF0000) >> 16; + /* step 3: exchange the two 32-bit halves of the 64-bit value */ + in = (in & 0x00000000FFFFFFFF) << 32 | (in & 0xFFFFFFFF00000000) >> 32; + *intsToSwap = in; + } +} +#endif + #ifdef LV_HAVE_SSE2 #include @@ -109,30 +133,6 @@ static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int n } #endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC - -static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, - unsigned int num_points) -{ - uint32_t* inputPtr = (uint32_t*)intsToSwap; - unsigned int point; - for (point = 0; point < num_points; point++) { - uint32_t output1 = *inputPtr; - uint32_t output2 = inputPtr[1]; - - output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | - ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); - - output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | - ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); - - *inputPtr++ = output2; - *inputPtr++ = output1; - } -} -#endif /* LV_HAVE_GENERIC */ - #if LV_HAVE_AVX2 #include static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int num_points) @@ -476,8 +476,8 @@ static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap, #ifdef LV_HAVE_GENERIC -static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap, - unsigned int num_points) +static inline void volk_64u_byteswap_generic_decompose(uint64_t* intsToSwap, + unsigned int num_points) { uint32_t* inputPtr = (uint32_t*)intsToSwap; unsigned int point; diff --git a/kernels/volk/volk_64u_byteswappuppet_64u.h b/kernels/volk/volk_64u_byteswappuppet_64u.h 
index b1004bb04..9bbb37e7c 100644 --- a/kernels/volk/volk_64u_byteswappuppet_64u.h +++ b/kernels/volk/volk_64u_byteswappuppet_64u.h @@ -15,6 +15,17 @@ #include #include +#ifdef LV_HAVE_GENERIC +static inline void volk_64u_byteswappuppet_64u_generic_decompose(uint64_t* output, + uint64_t* intsToSwap, + unsigned int num_points) +{ + /* byteswap intsToSwap in place (the input buffer is modified), then copy it to output */ + volk_64u_byteswap_generic_decompose((uint64_t*)intsToSwap, num_points); + memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t)); +} +#endif + #ifdef LV_HAVE_GENERIC static inline void volk_64u_byteswappuppet_64u_generic(uint64_t* output, uint64_t* intsToSwap,