diff --git a/include/volk/volk_avx2_intrinsics.h b/include/volk/volk_avx2_intrinsics.h
index 00f3b522d..2c397d9c5 100644
--- a/include/volk/volk_avx2_intrinsics.h
+++ b/include/volk/volk_avx2_intrinsics.h
@@ -119,4 +119,116 @@ static inline __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0,
     return _mm256_mul_ps(norms, scalar);
 }
 
+/*
+ * The function below vectorizes the inner loop of the following code:
+ *
+ * float max_values[8] = {0.f};
+ * unsigned max_indices[8] = {0};
+ * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+ * for (unsigned i = 0; i < num_points / 8; ++i) {
+ *     for (unsigned j = 0; j < 8; ++j) {
+ *         float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src0);
+ *         bool compare = abs_squared > max_values[j];
+ *         max_values[j] = compare ? abs_squared : max_values[j];
+ *         max_indices[j] = compare ? current_indices[j] : max_indices[j];
+ *         current_indices[j] += 8; // update for next outer loop iteration
+ *         ++src0;
+ *     }
+ * }
+ */
+static inline void vector_32fc_index_max_variant0(__m256 in0,
+                                                  __m256 in1,
+                                                  __m256* max_values,
+                                                  __m256i* max_indices,
+                                                  __m256i* current_indices,
+                                                  __m256i indices_increment)
+{
+    in0 = _mm256_mul_ps(in0, in0);
+    in1 = _mm256_mul_ps(in1, in1);
+
+    /*
+     * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0),
+     * hadd_ps(a, b) computes
+     * (b_7 + b_6,
+     *  b_5 + b_4,
+     *  ---------
+     *  a_7 + a_6,
+     *  a_5 + a_4,
+     *  ---------
+     *  b_3 + b_2,
+     *  b_1 + b_0,
+     *  ---------
+     *  a_3 + a_2,
+     *  a_1 + a_0).
+     * The result is the squared absolute value of complex numbers at index
+     * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of
+     * current_indices!
+     */
+    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
+
+    /*
+     * Compare the recently computed squared absolute values with the
+     * previously determined maximum values. cmp_ps(a, b) determines
+     * a > b ? 0xFFFFFFFF : 0 for each element in the vectors =>
+     * compare_mask = abs_squared > max_values ? 0xFFFFFFFF : 0
+     *
+     * If either operand is NaN, 0 is returned as an "ordered" comparison is
+     * used => the blend operation will select the value from *max_values.
+     */
+    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);
+
+    /* Select maximum by blending. This is the only line which differs from variant1. */
+    *max_values = _mm256_blendv_ps(*max_values, abs_squared, compare_mask);
+
+    /*
+     * Update indices: blendv_ps(a, b, mask) determines mask ? b : a for
+     * each element in the vectors =>
+     * max_indices = compare_mask ? current_indices : max_indices
+     *
+     * Note: The casting of data types is required to make the compiler happy
+     * and does not change values.
+     */
+    *max_indices =
+        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
+                                             _mm256_castsi256_ps(*current_indices),
+                                             compare_mask));
+
+    /* compute indices of complex numbers which will be loaded in the next iteration */
+    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
+}
+
+/* See _variant0 for details */
+static inline void vector_32fc_index_max_variant1(__m256 in0,
+                                                  __m256 in1,
+                                                  __m256* max_values,
+                                                  __m256i* max_indices,
+                                                  __m256i* current_indices,
+                                                  __m256i indices_increment)
+{
+    in0 = _mm256_mul_ps(in0, in0);
+    in1 = _mm256_mul_ps(in1, in1);
+
+    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
+    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);
+
+    /*
+     * This is the only line which differs from variant0.
+     * Using maxps instead of blendvps is faster on Intel CPUs (on the CPUs tested).
+     *
+     * Note: The order of arguments matters if a NaN is encountered in which
+     * case the value of the second argument is selected. This is consistent
+     * with the "ordered" comparison and the blend operation: The comparison
+     * returns false if a NaN is encountered and the blend operation
+     * consequently selects the value from max_indices.
+     */
+    *max_values = _mm256_max_ps(abs_squared, *max_values);
+
+    *max_indices =
+        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
+                                             _mm256_castsi256_ps(*current_indices),
+                                             compare_mask));
+
+    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
+}
+
 #endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */
diff --git a/kernels/volk/volk_32fc_index_max_16u.h b/kernels/volk/volk_32fc_index_max_16u.h
index 16e76cd96..80506cc87 100644
--- a/kernels/volk/volk_32fc_index_max_16u.h
+++ b/kernels/volk/volk_32fc_index_max_16u.h
@@ -84,82 +84,120 @@
 #ifdef LV_HAVE_AVX2
 #include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
 
-static inline void
-volk_32fc_index_max_16u_a_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
+static inline void volk_32fc_index_max_16u_a_avx2_variant_0(uint16_t* target,
+                                                            lv_32fc_t* src0,
+                                                            uint32_t num_points)
 {
     num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
-    const uint32_t num_bytes = num_points * 8;
-
-    union bit256 holderf;
-    union bit256 holderi;
-    float sq_dist = 0.0;
-    float max = 0.0;
-    uint16_t index = 0;
-
-    union bit256 xmm5, xmm4;
-    __m256 xmm1, xmm2, xmm3;
-    __m256i xmm8, xmm11, xmm12, xmm9, xmm10;
-
-    xmm5.int_vec = _mm256_setzero_si256();
-    xmm4.int_vec = _mm256_setzero_si256();
-    holderf.int_vec = _mm256_setzero_si256();
-    holderi.int_vec = _mm256_setzero_si256();
-
-    int bound = num_bytes >> 6;
-    int i = 0;
-
-    xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
-    xmm9 = _mm256_setzero_si256();
-    xmm10 = _mm256_set1_epi32(8);
-    xmm3 = _mm256_setzero_ps();
-
-    __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
-    for (; i < bound; ++i) {
-        xmm1 = _mm256_load_ps((float*)src0);
-        xmm2 = _mm256_load_ps((float*)&src0[4]);
+    const __m256i indices_increment = _mm256_set1_epi32(8);
+    /*
+     * At the start of each loop iteration current_indices holds the indices of
+     * the complex numbers loaded from memory. Explanation for odd order is given
+     * in implementation of vector_32fc_index_max_variant0().
+ */ + __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + + __m256 max_values = _mm256_setzero_ps(); + __m256i max_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { + __m256 in0 = _mm256_load_ps((float*)src0); + __m256 in1 = _mm256_load_ps((float*)(src0 + 4)); + vector_32fc_index_max_variant0( + in0, in1, &max_values, &max_indices, ¤t_indices, indices_increment); src0 += 8; + } - xmm1 = _mm256_mul_ps(xmm1, xmm1); - xmm2 = _mm256_mul_ps(xmm2, xmm2); + // determine maximum value and index in the result of the vectorized loop + __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8]; + __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8]; + _mm256_store_ps(max_values_buffer, max_values); + _mm256_store_si256((__m256i*)max_indices_buffer, max_indices); + + float max = 0.f; + uint32_t index = 0; + for (unsigned i = 0; i < 8; i++) { + if (max_values_buffer[i] > max) { + max = max_values_buffer[i]; + index = max_indices_buffer[i]; + } + } - xmm1 = _mm256_hadd_ps(xmm1, xmm2); - xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = + lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + if (abs_squared > max) { + max = abs_squared; + index = i; + } + ++src0; + } - xmm3 = _mm256_max_ps(xmm1, xmm3); + *target = index; +} - xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); - xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); +#endif /*LV_HAVE_AVX2*/ - xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); - xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +#ifdef LV_HAVE_AVX2 +#include +#include - xmm9 = _mm256_add_epi32(xmm11, xmm12); +static inline void volk_32fc_index_max_16u_a_avx2_variant_1(uint16_t* target, + lv_32fc_t* src0, + uint32_t num_points) +{ + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; - xmm8 = _mm256_add_epi32(xmm8, xmm10); + const __m256i indices_increment = _mm256_set1_epi32(8); + /* + * At the start of each loop iteration current_indices holds the indices of + * the complex numbers loaded from memory. Explanation for odd order is given + * in implementation of vector_32fc_index_max_variant0(). 
+ */ + __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + + __m256 max_values = _mm256_setzero_ps(); + __m256i max_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { + __m256 in0 = _mm256_load_ps((float*)src0); + __m256 in1 = _mm256_load_ps((float*)(src0 + 4)); + vector_32fc_index_max_variant1( + in0, in1, &max_values, &max_indices, ¤t_indices, indices_increment); + src0 += 8; } - _mm256_store_ps((float*)&(holderf.f), xmm3); - _mm256_store_si256(&(holderi.int_vec), xmm9); - - for (i = 0; i < 8; i++) { - if (holderf.f[i] > max) { - index = holderi.i[i]; - max = holderf.f[i]; + // determine maximum value and index in the result of the vectorized loop + __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8]; + __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8]; + _mm256_store_ps(max_values_buffer, max_values); + _mm256_store_si256((__m256i*)max_indices_buffer, max_indices); + + float max = 0.f; + uint32_t index = 0; + for (unsigned i = 0; i < 8; i++) { + if (max_values_buffer[i] > max) { + max = max_values_buffer[i]; + index = max_indices_buffer[i]; } } - for (i = bound * 8; i < num_points; i++, src0++) { - sq_dist = - lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); - - if (sq_dist > max) { + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = + lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + if (abs_squared > max) { + max = abs_squared; index = i; - max = sq_dist; } + ++src0; } - target[0] = index; + + *target = index; } #endif /*LV_HAVE_AVX2*/ @@ -323,82 +361,120 @@ volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_ #ifdef LV_HAVE_AVX2 #include +#include -static inline void -volk_32fc_index_max_16u_u_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) +static inline void volk_32fc_index_max_16u_u_avx2_variant_0(uint16_t* target, + lv_32fc_t* src0, + uint32_t num_points) { num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; - const uint32_t num_bytes = num_points * 8; - - union bit256 holderf; - union bit256 holderi; - float sq_dist = 0.0; - float max = 0.0; - uint16_t index = 0; - - union bit256 xmm5, xmm4; - __m256 xmm1, xmm2, xmm3; - __m256i xmm8, xmm11, xmm12, xmm9, xmm10; - - xmm5.int_vec = _mm256_setzero_si256(); - xmm4.int_vec = _mm256_setzero_si256(); - holderf.int_vec = _mm256_setzero_si256(); - holderi.int_vec = _mm256_setzero_si256(); - - int bound = num_bytes >> 6; - int i = 0; - - xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); - xmm9 = _mm256_setzero_si256(); - xmm10 = _mm256_set1_epi32(8); - xmm3 = _mm256_setzero_ps(); - - __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7); - for (; i < bound; ++i) { - xmm1 = _mm256_loadu_ps((float*)src0); - xmm2 = _mm256_loadu_ps((float*)&src0[4]); + const __m256i indices_increment = _mm256_set1_epi32(8); + /* + * At the start of each loop iteration current_indices holds the indices of + * the complex numbers loaded from memory. Explanation for odd order is given + * in implementation of vector_32fc_index_max_variant0(). 
+ */ + __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + + __m256 max_values = _mm256_setzero_ps(); + __m256i max_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { + __m256 in0 = _mm256_loadu_ps((float*)src0); + __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4)); + vector_32fc_index_max_variant0( + in0, in1, &max_values, &max_indices, ¤t_indices, indices_increment); src0 += 8; + } - xmm1 = _mm256_mul_ps(xmm1, xmm1); - xmm2 = _mm256_mul_ps(xmm2, xmm2); + // determine maximum value and index in the result of the vectorized loop + __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8]; + __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8]; + _mm256_store_ps(max_values_buffer, max_values); + _mm256_store_si256((__m256i*)max_indices_buffer, max_indices); + + float max = 0.f; + uint32_t index = 0; + for (unsigned i = 0; i < 8; i++) { + if (max_values_buffer[i] > max) { + max = max_values_buffer[i]; + index = max_indices_buffer[i]; + } + } - xmm1 = _mm256_hadd_ps(xmm1, xmm2); - xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = + lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + if (abs_squared > max) { + max = abs_squared; + index = i; + } + ++src0; + } - xmm3 = _mm256_max_ps(xmm1, xmm3); + *target = index; +} - xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); - xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); +#endif /*LV_HAVE_AVX2*/ - xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); - xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +#ifdef LV_HAVE_AVX2 +#include +#include - xmm9 = _mm256_add_epi32(xmm11, xmm12); +static inline void volk_32fc_index_max_16u_u_avx2_variant_1(uint16_t* target, + lv_32fc_t* src0, + uint32_t num_points) +{ + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; - xmm8 = _mm256_add_epi32(xmm8, xmm10); + const __m256i indices_increment = _mm256_set1_epi32(8); + /* + * At the start of each loop iteration current_indices holds the indices of + * the complex numbers loaded from memory. Explanation for odd order is given + * in implementation of vector_32fc_index_max_variant0(). 
+ */ + __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + + __m256 max_values = _mm256_setzero_ps(); + __m256i max_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { + __m256 in0 = _mm256_loadu_ps((float*)src0); + __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4)); + vector_32fc_index_max_variant1( + in0, in1, &max_values, &max_indices, ¤t_indices, indices_increment); + src0 += 8; } - _mm256_storeu_ps((float*)&(holderf.f), xmm3); - _mm256_storeu_si256(&(holderi.int_vec), xmm9); - - for (i = 0; i < 8; i++) { - if (holderf.f[i] > max) { - index = holderi.i[i]; - max = holderf.f[i]; + // determine maximum value and index in the result of the vectorized loop + __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8]; + __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8]; + _mm256_store_ps(max_values_buffer, max_values); + _mm256_store_si256((__m256i*)max_indices_buffer, max_indices); + + float max = 0.f; + uint32_t index = 0; + for (unsigned i = 0; i < 8; i++) { + if (max_values_buffer[i] > max) { + max = max_values_buffer[i]; + index = max_indices_buffer[i]; } } - for (i = bound * 8; i < num_points; i++, src0++) { - sq_dist = - lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); - - if (sq_dist > max) { + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = + lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + if (abs_squared > max) { + max = abs_squared; index = i; - max = sq_dist; } + ++src0; } - target[0] = index; + + *target = index; } #endif /*LV_HAVE_AVX2*/ diff --git a/kernels/volk/volk_32fc_index_max_32u.h b/kernels/volk/volk_32fc_index_max_32u.h index 556b5fc73..063895012 100644 --- a/kernels/volk/volk_32fc_index_max_32u.h +++ b/kernels/volk/volk_32fc_index_max_32u.h @@ -77,81 +77,116 @@ #ifdef LV_HAVE_AVX2 #include +#include -static inline void -volk_32fc_index_max_32u_a_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) +static inline void volk_32fc_index_max_32u_a_avx2_variant_0(uint32_t* target, + lv_32fc_t* src0, + uint32_t num_points) { - const uint32_t num_bytes = num_points * 8; - - union bit256 holderf; - union bit256 holderi; - float sq_dist = 0.0; - float max = 0.0; - uint32_t index = 0; - - union bit256 xmm5, xmm4; - __m256 xmm1, xmm2, xmm3; - __m256i xmm8, xmm11, xmm12, xmm9, xmm10; - - xmm5.int_vec = _mm256_setzero_si256(); - xmm4.int_vec = _mm256_setzero_si256(); - holderf.int_vec = _mm256_setzero_si256(); - holderi.int_vec = _mm256_setzero_si256(); - - int bound = num_bytes >> 6; - int i = 0; - - xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); - xmm9 = _mm256_setzero_si256(); - xmm10 = _mm256_set1_epi32(8); - xmm3 = _mm256_setzero_ps(); - __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7); - - for (; i < bound; ++i) { - xmm1 = _mm256_load_ps((float*)src0); - xmm2 = _mm256_load_ps((float*)&src0[4]); - + const __m256i indices_increment = _mm256_set1_epi32(8); + /* + * At the start of each loop iteration current_indices holds the indices of + * the complex numbers loaded from memory. Explanation for odd order is given + * in implementation of vector_32fc_index_max_variant0(). 
+ */ + __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + + __m256 max_values = _mm256_setzero_ps(); + __m256i max_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { + __m256 in0 = _mm256_load_ps((float*)src0); + __m256 in1 = _mm256_load_ps((float*)(src0 + 4)); + vector_32fc_index_max_variant0( + in0, in1, &max_values, &max_indices, ¤t_indices, indices_increment); src0 += 8; + } - xmm1 = _mm256_mul_ps(xmm1, xmm1); - xmm2 = _mm256_mul_ps(xmm2, xmm2); + // determine maximum value and index in the result of the vectorized loop + __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8]; + __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8]; + _mm256_store_ps(max_values_buffer, max_values); + _mm256_store_si256((__m256i*)max_indices_buffer, max_indices); - xmm1 = _mm256_hadd_ps(xmm1, xmm2); - xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); + float max = 0.f; + uint32_t index = 0; + for (unsigned i = 0; i < 8; i++) { + if (max_values_buffer[i] > max) { + max = max_values_buffer[i]; + index = max_indices_buffer[i]; + } + } - xmm3 = _mm256_max_ps(xmm1, xmm3); + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = + lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + if (abs_squared > max) { + max = abs_squared; + index = i; + } + ++src0; + } - xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); - xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); + *target = index; +} - xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); - xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +#endif /*LV_HAVE_AVX2*/ - xmm9 = _mm256_add_epi32(xmm11, xmm12); +#ifdef LV_HAVE_AVX2 +#include +#include - xmm8 = _mm256_add_epi32(xmm8, xmm10); +static inline void volk_32fc_index_max_32u_a_avx2_variant_1(uint32_t* target, + lv_32fc_t* src0, + uint32_t num_points) +{ + const __m256i indices_increment = _mm256_set1_epi32(8); + /* + * At the start of each loop iteration current_indices holds the indices of + * the complex numbers loaded from memory. Explanation for odd order is given + * in implementation of vector_32fc_index_max_variant0(). 
+ */ + __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + + __m256 max_values = _mm256_setzero_ps(); + __m256i max_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { + __m256 in0 = _mm256_load_ps((float*)src0); + __m256 in1 = _mm256_load_ps((float*)(src0 + 4)); + vector_32fc_index_max_variant1( + in0, in1, &max_values, &max_indices, ¤t_indices, indices_increment); + src0 += 8; } - _mm256_store_ps((float*)&(holderf.f), xmm3); - _mm256_store_si256(&(holderi.int_vec), xmm9); + // determine maximum value and index in the result of the vectorized loop + __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8]; + __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8]; + _mm256_store_ps(max_values_buffer, max_values); + _mm256_store_si256((__m256i*)max_indices_buffer, max_indices); - for (i = 0; i < 8; i++) { - if (holderf.f[i] > max) { - index = holderi.i[i]; - max = holderf.f[i]; + float max = 0.f; + uint32_t index = 0; + for (unsigned i = 0; i < 8; i++) { + if (max_values_buffer[i] > max) { + max = max_values_buffer[i]; + index = max_indices_buffer[i]; } } - for (i = bound * 8; i < num_points; i++, src0++) { - sq_dist = - lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); - - if (sq_dist > max) { + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = + lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + if (abs_squared > max) { + max = abs_squared; index = i; - max = sq_dist; } + ++src0; } - target[0] = index; + + *target = index; } #endif /*LV_HAVE_AVX2*/ @@ -311,81 +346,116 @@ volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_ #ifdef LV_HAVE_AVX2 #include +#include -static inline void -volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) +static inline void volk_32fc_index_max_32u_u_avx2_variant_0(uint32_t* target, + lv_32fc_t* src0, + uint32_t num_points) { - const uint32_t num_bytes = num_points * 8; - - union bit256 holderf; - union bit256 holderi; - float sq_dist = 0.0; - float max = 0.0; - uint32_t index = 0; - - union bit256 xmm5, xmm4; - __m256 xmm1, xmm2, xmm3; - __m256i xmm8, xmm11, xmm12, xmm9, xmm10; - - xmm5.int_vec = _mm256_setzero_si256(); - xmm4.int_vec = _mm256_setzero_si256(); - holderf.int_vec = _mm256_setzero_si256(); - holderi.int_vec = _mm256_setzero_si256(); - - int bound = num_bytes >> 6; - int i = 0; - - xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); - xmm9 = _mm256_setzero_si256(); - xmm10 = _mm256_set1_epi32(8); - xmm3 = _mm256_setzero_ps(); - __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7); - - for (; i < bound; ++i) { - xmm1 = _mm256_loadu_ps((float*)src0); - xmm2 = _mm256_loadu_ps((float*)&src0[4]); - + const __m256i indices_increment = _mm256_set1_epi32(8); + /* + * At the start of each loop iteration current_indices holds the indices of + * the complex numbers loaded from memory. Explanation for odd order is given + * in implementation of vector_32fc_index_max_variant0(). 
+ */ + __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + + __m256 max_values = _mm256_setzero_ps(); + __m256i max_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { + __m256 in0 = _mm256_loadu_ps((float*)src0); + __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4)); + vector_32fc_index_max_variant0( + in0, in1, &max_values, &max_indices, ¤t_indices, indices_increment); src0 += 8; + } - xmm1 = _mm256_mul_ps(xmm1, xmm1); - xmm2 = _mm256_mul_ps(xmm2, xmm2); + // determine maximum value and index in the result of the vectorized loop + __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8]; + __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8]; + _mm256_store_ps(max_values_buffer, max_values); + _mm256_store_si256((__m256i*)max_indices_buffer, max_indices); - xmm1 = _mm256_hadd_ps(xmm1, xmm2); - xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); + float max = 0.f; + uint32_t index = 0; + for (unsigned i = 0; i < 8; i++) { + if (max_values_buffer[i] > max) { + max = max_values_buffer[i]; + index = max_indices_buffer[i]; + } + } - xmm3 = _mm256_max_ps(xmm1, xmm3); + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = + lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + if (abs_squared > max) { + max = abs_squared; + index = i; + } + ++src0; + } - xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); - xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); + *target = index; +} - xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); - xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +#endif /*LV_HAVE_AVX2*/ - xmm9 = _mm256_add_epi32(xmm11, xmm12); +#ifdef LV_HAVE_AVX2 +#include +#include - xmm8 = _mm256_add_epi32(xmm8, xmm10); +static inline void volk_32fc_index_max_32u_u_avx2_variant_1(uint32_t* target, + lv_32fc_t* src0, + uint32_t num_points) +{ + const __m256i indices_increment = _mm256_set1_epi32(8); + /* + * At the start of each loop iteration current_indices holds the indices of + * the complex numbers loaded from memory. Explanation for odd order is given + * in implementation of vector_32fc_index_max_variant0(). 
+ */ + __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + + __m256 max_values = _mm256_setzero_ps(); + __m256i max_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { + __m256 in0 = _mm256_loadu_ps((float*)src0); + __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4)); + vector_32fc_index_max_variant1( + in0, in1, &max_values, &max_indices, ¤t_indices, indices_increment); + src0 += 8; } - _mm256_storeu_ps((float*)&(holderf.f), xmm3); - _mm256_storeu_si256(&(holderi.int_vec), xmm9); + // determine maximum value and index in the result of the vectorized loop + __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8]; + __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8]; + _mm256_store_ps(max_values_buffer, max_values); + _mm256_store_si256((__m256i*)max_indices_buffer, max_indices); - for (i = 0; i < 8; i++) { - if (holderf.f[i] > max) { - index = holderi.i[i]; - max = holderf.f[i]; + float max = 0.f; + uint32_t index = 0; + for (unsigned i = 0; i < 8; i++) { + if (max_values_buffer[i] > max) { + max = max_values_buffer[i]; + index = max_indices_buffer[i]; } } - for (i = bound * 8; i < num_points; i++, src0++) { - sq_dist = - lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); - - if (sq_dist > max) { + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = + lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + if (abs_squared > max) { + max = abs_squared; index = i; - max = sq_dist; } + ++src0; } - target[0] = index; + + *target = index; } #endif /*LV_HAVE_AVX2*/
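
Reviewer note (not part of the patch): the lane order produced by _mm256_hadd_ps is the one subtle point in the new intrinsics, so a small standalone check can confirm that _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0) is the right initial value for current_indices. The sketch below is illustrative only; the file name and build line (e.g. gcc -mavx2 -O2 hadd_order_demo.c) are assumptions, not part of VOLK.

/* hadd_order_demo.c - prints which complex sample each hadd_ps lane holds. */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    /* Complex sample k is (k + 0j), so its squared magnitude is k*k and each
     * lane of the hadd result can be identified by eye. */
    float data[16];
    for (int k = 0; k < 8; ++k) {
        data[2 * k] = (float)k; /* real part */
        data[2 * k + 1] = 0.f;  /* imaginary part */
    }

    __m256 in0 = _mm256_loadu_ps(data);     /* samples 0..3 */
    __m256 in1 = _mm256_loadu_ps(data + 8); /* samples 4..7 */
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);

    float lanes[8];
    _mm256_storeu_ps(lanes, abs_squared);
    /* Prints 0 1 16 25 4 9 36 49, i.e. the squared magnitudes of samples
     * 0, 1, 4, 5, 2, 3, 6, 7 - exactly the order hard-coded in current_indices. */
    for (int lane = 0; lane < 8; ++lane) {
        printf("lane %d: %.0f\n", lane, lanes[lane]);
    }
    return 0;
}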