diff --git a/include/volk/volk_avx2_intrinsics.h b/include/volk/volk_avx2_intrinsics.h
index 00f3b522d..2c397d9c5 100644
--- a/include/volk/volk_avx2_intrinsics.h
+++ b/include/volk/volk_avx2_intrinsics.h
@@ -119,4 +119,116 @@ static inline __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0,
     return _mm256_mul_ps(norms, scalar);
 }
 
+/*
+ * The function below vectorizes the inner loop of the following code:
+ *
+ * float max_values[8] = {0.f};
+ * unsigned max_indices[8] = {0};
+ * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+ * for (unsigned i = 0; i < num_points / 8; ++i) {
+ *     for (unsigned j = 0; j < 8; ++j) {
+ *         float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src0);
+ *         bool compare = abs_squared > max_values[j];
+ *         max_values[j] = compare ? abs_squared : max_values[j];
+ *         max_indices[j] = compare ? current_indices[j] : max_indices[j];
+ *         current_indices[j] += 8; // update for next outer loop iteration
+ *         ++src0;
+ *     }
+ * }
+ */
+static inline void vector_32fc_index_max_variant0(__m256 in0,
+                                                  __m256 in1,
+                                                  __m256* max_values,
+                                                  __m256i* max_indices,
+                                                  __m256i* current_indices,
+                                                  __m256i indices_increment)
+{
+    in0 = _mm256_mul_ps(in0, in0);
+    in1 = _mm256_mul_ps(in1, in1);
+
+    /*
+     * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0),
+     * hadd_ps(a, b) computes
+     * (b_7 + b_6,
+     *  b_5 + b_4,
+     *  ---------
+     *  a_7 + a_6,
+     *  a_5 + a_4,
+     *  ---------
+     *  b_3 + b_2,
+     *  b_1 + b_0,
+     *  ---------
+     *  a_3 + a_2,
+     *  a_1 + a_0).
+     * The result is the squared absolute value of complex numbers at index
+     * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of
+     * current_indices!
+     */
+    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
+
+    /*
+     * Compare the recently computed squared absolute values with the
+     * previously determined maximum values. cmp_ps(a, b) determines
+     * a > b ? 0xFFFFFFFF : 0 for each element in the vectors =>
+     * compare_mask = abs_squared > max_values ? 0xFFFFFFFF : 0
+     *
+     * If either operand is NaN, 0 is returned as an "ordered" comparison is
+     * used => the blend operation will select the value from *max_values.
+     */
+    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);
+
+    /* Select maximum by blending. This is the only line which differs from variant1. */
+    *max_values = _mm256_blendv_ps(*max_values, abs_squared, compare_mask);
+
+    /*
+     * Update indices: blendv_ps(a, b, mask) determines mask ? b : a for
+     * each element in the vectors =>
+     * max_indices = compare_mask ? current_indices : max_indices
+     *
+     * Note: The casting of data types is required to make the compiler happy
+     * and does not change values.
+     */
+    *max_indices =
+        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
+                                             _mm256_castsi256_ps(*current_indices),
+                                             compare_mask));
+
+    /* compute indices of complex numbers which will be loaded in the next iteration */
+    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
+}
+
+/* See _variant0 for details */
+static inline void vector_32fc_index_max_variant1(__m256 in0,
+                                                  __m256 in1,
+                                                  __m256* max_values,
+                                                  __m256i* max_indices,
+                                                  __m256i* current_indices,
+                                                  __m256i indices_increment)
+{
+    in0 = _mm256_mul_ps(in0, in0);
+    in1 = _mm256_mul_ps(in1, in1);
+
+    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
+    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);
+
+    /*
+     * This is the only line which differs from variant0.
+     * Using maxps instead of blendvps is faster on Intel CPUs (on the CPUs tested).
+     *
+     * Note: The order of arguments matters if a NaN is encountered in which
+     * case the value of the second argument is selected. This is consistent
+     * with the "ordered" comparison and the blend operation: The comparison
+     * returns false if a NaN is encountered and the blend operation
+     * consequently selects the value from max_indices.
+     */
+    *max_values = _mm256_max_ps(abs_squared, *max_values);
+
+    *max_indices =
+        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
+                                             _mm256_castsi256_ps(*current_indices),
+                                             compare_mask));
+
+    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
+}
+
 #endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */
diff --git a/kernels/volk/volk_32fc_index_max_16u.h b/kernels/volk/volk_32fc_index_max_16u.h
index 16e76cd96..80506cc87 100644
--- a/kernels/volk/volk_32fc_index_max_16u.h
+++ b/kernels/volk/volk_32fc_index_max_16u.h
@@ -84,82 +84,120 @@
 #ifdef LV_HAVE_AVX2
 #include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
 
-static inline void
-volk_32fc_index_max_16u_a_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
+static inline void volk_32fc_index_max_16u_a_avx2_variant_0(uint16_t* target,
+                                                            lv_32fc_t* src0,
+                                                            uint32_t num_points)
 {
     num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
-    const uint32_t num_bytes = num_points * 8;
-
-    union bit256 holderf;
-    union bit256 holderi;
-    float sq_dist = 0.0;
-    float max = 0.0;
-    uint16_t index = 0;
-
-    union bit256 xmm5, xmm4;
-    __m256 xmm1, xmm2, xmm3;
-    __m256i xmm8, xmm11, xmm12, xmm9, xmm10;
-
-    xmm5.int_vec = _mm256_setzero_si256();
-    xmm4.int_vec = _mm256_setzero_si256();
-    holderf.int_vec = _mm256_setzero_si256();
-    holderi.int_vec = _mm256_setzero_si256();
-
-    int bound = num_bytes >> 6;
-    int i = 0;
-
-    xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
-    xmm9 = _mm256_setzero_si256();
-    xmm10 = _mm256_set1_epi32(8);
-    xmm3 = _mm256_setzero_ps();
-
-    __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
-    for (; i < bound; ++i) {
-        xmm1 = _mm256_load_ps((float*)src0);
-        xmm2 = _mm256_load_ps((float*)&src0[4]);
+    const __m256i indices_increment = _mm256_set1_epi32(8);
+    /*
+     * At the start of each loop iteration current_indices holds the indices of
+     * the complex numbers loaded from memory. Explanation for odd order is given
+     * in implementation of vector_32fc_index_max_variant0().
+ */ + __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + + __m256 max_values = _mm256_setzero_ps(); + __m256i max_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { + __m256 in0 = _mm256_load_ps((float*)src0); + __m256 in1 = _mm256_load_ps((float*)(src0 + 4)); + vector_32fc_index_max_variant0( + in0, in1, &max_values, &max_indices, ¤t_indices, indices_increment); src0 += 8; + } - xmm1 = _mm256_mul_ps(xmm1, xmm1); - xmm2 = _mm256_mul_ps(xmm2, xmm2); + // determine maximum value and index in the result of the vectorized loop + __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8]; + __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8]; + _mm256_store_ps(max_values_buffer, max_values); + _mm256_store_si256((__m256i*)max_indices_buffer, max_indices); + + float max = 0.f; + uint32_t index = 0; + for (unsigned i = 0; i < 8; i++) { + if (max_values_buffer[i] > max) { + max = max_values_buffer[i]; + index = max_indices_buffer[i]; + } + } - xmm1 = _mm256_hadd_ps(xmm1, xmm2); - xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = + lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + if (abs_squared > max) { + max = abs_squared; + index = i; + } + ++src0; + } - xmm3 = _mm256_max_ps(xmm1, xmm3); + *target = index; +} - xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); - xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); +#endif /*LV_HAVE_AVX2*/ - xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); - xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +#ifdef LV_HAVE_AVX2 +#include +#include - xmm9 = _mm256_add_epi32(xmm11, xmm12); +static inline void volk_32fc_index_max_16u_a_avx2_variant_1(uint16_t* target, + lv_32fc_t* src0, + uint32_t num_points) +{ + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; - xmm8 = _mm256_add_epi32(xmm8, xmm10); + const __m256i indices_increment = _mm256_set1_epi32(8); + /* + * At the start of each loop iteration current_indices holds the indices of + * the complex numbers loaded from memory. Explanation for odd order is given + * in implementation of vector_32fc_index_max_variant0(). 
+ */ + __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + + __m256 max_values = _mm256_setzero_ps(); + __m256i max_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { + __m256 in0 = _mm256_load_ps((float*)src0); + __m256 in1 = _mm256_load_ps((float*)(src0 + 4)); + vector_32fc_index_max_variant1( + in0, in1, &max_values, &max_indices, ¤t_indices, indices_increment); + src0 += 8; } - _mm256_store_ps((float*)&(holderf.f), xmm3); - _mm256_store_si256(&(holderi.int_vec), xmm9); - - for (i = 0; i < 8; i++) { - if (holderf.f[i] > max) { - index = holderi.i[i]; - max = holderf.f[i]; + // determine maximum value and index in the result of the vectorized loop + __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8]; + __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8]; + _mm256_store_ps(max_values_buffer, max_values); + _mm256_store_si256((__m256i*)max_indices_buffer, max_indices); + + float max = 0.f; + uint32_t index = 0; + for (unsigned i = 0; i < 8; i++) { + if (max_values_buffer[i] > max) { + max = max_values_buffer[i]; + index = max_indices_buffer[i]; } } - for (i = bound * 8; i < num_points; i++, src0++) { - sq_dist = - lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); - - if (sq_dist > max) { + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = + lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + if (abs_squared > max) { + max = abs_squared; index = i; - max = sq_dist; } + ++src0; } - target[0] = index; + + *target = index; } #endif /*LV_HAVE_AVX2*/ @@ -323,82 +361,120 @@ volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_ #ifdef LV_HAVE_AVX2 #include +#include -static inline void -volk_32fc_index_max_16u_u_avx2(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) +static inline void volk_32fc_index_max_16u_u_avx2_variant_0(uint16_t* target, + lv_32fc_t* src0, + uint32_t num_points) { num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; - const uint32_t num_bytes = num_points * 8; - - union bit256 holderf; - union bit256 holderi; - float sq_dist = 0.0; - float max = 0.0; - uint16_t index = 0; - - union bit256 xmm5, xmm4; - __m256 xmm1, xmm2, xmm3; - __m256i xmm8, xmm11, xmm12, xmm9, xmm10; - - xmm5.int_vec = _mm256_setzero_si256(); - xmm4.int_vec = _mm256_setzero_si256(); - holderf.int_vec = _mm256_setzero_si256(); - holderi.int_vec = _mm256_setzero_si256(); - - int bound = num_bytes >> 6; - int i = 0; - - xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); - xmm9 = _mm256_setzero_si256(); - xmm10 = _mm256_set1_epi32(8); - xmm3 = _mm256_setzero_ps(); - - __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7); - for (; i < bound; ++i) { - xmm1 = _mm256_loadu_ps((float*)src0); - xmm2 = _mm256_loadu_ps((float*)&src0[4]); + const __m256i indices_increment = _mm256_set1_epi32(8); + /* + * At the start of each loop iteration current_indices holds the indices of + * the complex numbers loaded from memory. Explanation for odd order is given + * in implementation of vector_32fc_index_max_variant0(). 
+ */ + __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + + __m256 max_values = _mm256_setzero_ps(); + __m256i max_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { + __m256 in0 = _mm256_loadu_ps((float*)src0); + __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4)); + vector_32fc_index_max_variant0( + in0, in1, &max_values, &max_indices, ¤t_indices, indices_increment); src0 += 8; + } - xmm1 = _mm256_mul_ps(xmm1, xmm1); - xmm2 = _mm256_mul_ps(xmm2, xmm2); + // determine maximum value and index in the result of the vectorized loop + __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8]; + __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8]; + _mm256_store_ps(max_values_buffer, max_values); + _mm256_store_si256((__m256i*)max_indices_buffer, max_indices); + + float max = 0.f; + uint32_t index = 0; + for (unsigned i = 0; i < 8; i++) { + if (max_values_buffer[i] > max) { + max = max_values_buffer[i]; + index = max_indices_buffer[i]; + } + } - xmm1 = _mm256_hadd_ps(xmm1, xmm2); - xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = + lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + if (abs_squared > max) { + max = abs_squared; + index = i; + } + ++src0; + } - xmm3 = _mm256_max_ps(xmm1, xmm3); + *target = index; +} - xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); - xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); +#endif /*LV_HAVE_AVX2*/ - xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); - xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +#ifdef LV_HAVE_AVX2 +#include +#include - xmm9 = _mm256_add_epi32(xmm11, xmm12); +static inline void volk_32fc_index_max_16u_u_avx2_variant_1(uint16_t* target, + lv_32fc_t* src0, + uint32_t num_points) +{ + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; - xmm8 = _mm256_add_epi32(xmm8, xmm10); + const __m256i indices_increment = _mm256_set1_epi32(8); + /* + * At the start of each loop iteration current_indices holds the indices of + * the complex numbers loaded from memory. Explanation for odd order is given + * in implementation of vector_32fc_index_max_variant0(). 
+ */ + __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + + __m256 max_values = _mm256_setzero_ps(); + __m256i max_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { + __m256 in0 = _mm256_loadu_ps((float*)src0); + __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4)); + vector_32fc_index_max_variant1( + in0, in1, &max_values, &max_indices, ¤t_indices, indices_increment); + src0 += 8; } - _mm256_storeu_ps((float*)&(holderf.f), xmm3); - _mm256_storeu_si256(&(holderi.int_vec), xmm9); - - for (i = 0; i < 8; i++) { - if (holderf.f[i] > max) { - index = holderi.i[i]; - max = holderf.f[i]; + // determine maximum value and index in the result of the vectorized loop + __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8]; + __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8]; + _mm256_store_ps(max_values_buffer, max_values); + _mm256_store_si256((__m256i*)max_indices_buffer, max_indices); + + float max = 0.f; + uint32_t index = 0; + for (unsigned i = 0; i < 8; i++) { + if (max_values_buffer[i] > max) { + max = max_values_buffer[i]; + index = max_indices_buffer[i]; } } - for (i = bound * 8; i < num_points; i++, src0++) { - sq_dist = - lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); - - if (sq_dist > max) { + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = + lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + if (abs_squared > max) { + max = abs_squared; index = i; - max = sq_dist; } + ++src0; } - target[0] = index; + + *target = index; } #endif /*LV_HAVE_AVX2*/ diff --git a/kernels/volk/volk_32fc_index_max_32u.h b/kernels/volk/volk_32fc_index_max_32u.h index 556b5fc73..063895012 100644 --- a/kernels/volk/volk_32fc_index_max_32u.h +++ b/kernels/volk/volk_32fc_index_max_32u.h @@ -77,81 +77,116 @@ #ifdef LV_HAVE_AVX2 #include +#include -static inline void -volk_32fc_index_max_32u_a_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) +static inline void volk_32fc_index_max_32u_a_avx2_variant_0(uint32_t* target, + lv_32fc_t* src0, + uint32_t num_points) { - const uint32_t num_bytes = num_points * 8; - - union bit256 holderf; - union bit256 holderi; - float sq_dist = 0.0; - float max = 0.0; - uint32_t index = 0; - - union bit256 xmm5, xmm4; - __m256 xmm1, xmm2, xmm3; - __m256i xmm8, xmm11, xmm12, xmm9, xmm10; - - xmm5.int_vec = _mm256_setzero_si256(); - xmm4.int_vec = _mm256_setzero_si256(); - holderf.int_vec = _mm256_setzero_si256(); - holderi.int_vec = _mm256_setzero_si256(); - - int bound = num_bytes >> 6; - int i = 0; - - xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); - xmm9 = _mm256_setzero_si256(); - xmm10 = _mm256_set1_epi32(8); - xmm3 = _mm256_setzero_ps(); - __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7); - - for (; i < bound; ++i) { - xmm1 = _mm256_load_ps((float*)src0); - xmm2 = _mm256_load_ps((float*)&src0[4]); - + const __m256i indices_increment = _mm256_set1_epi32(8); + /* + * At the start of each loop iteration current_indices holds the indices of + * the complex numbers loaded from memory. Explanation for odd order is given + * in implementation of vector_32fc_index_max_variant0(). 
+ */ + __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + + __m256 max_values = _mm256_setzero_ps(); + __m256i max_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { + __m256 in0 = _mm256_load_ps((float*)src0); + __m256 in1 = _mm256_load_ps((float*)(src0 + 4)); + vector_32fc_index_max_variant0( + in0, in1, &max_values, &max_indices, ¤t_indices, indices_increment); src0 += 8; + } - xmm1 = _mm256_mul_ps(xmm1, xmm1); - xmm2 = _mm256_mul_ps(xmm2, xmm2); + // determine maximum value and index in the result of the vectorized loop + __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8]; + __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8]; + _mm256_store_ps(max_values_buffer, max_values); + _mm256_store_si256((__m256i*)max_indices_buffer, max_indices); - xmm1 = _mm256_hadd_ps(xmm1, xmm2); - xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); + float max = 0.f; + uint32_t index = 0; + for (unsigned i = 0; i < 8; i++) { + if (max_values_buffer[i] > max) { + max = max_values_buffer[i]; + index = max_indices_buffer[i]; + } + } - xmm3 = _mm256_max_ps(xmm1, xmm3); + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = + lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + if (abs_squared > max) { + max = abs_squared; + index = i; + } + ++src0; + } - xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); - xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); + *target = index; +} - xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); - xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +#endif /*LV_HAVE_AVX2*/ - xmm9 = _mm256_add_epi32(xmm11, xmm12); +#ifdef LV_HAVE_AVX2 +#include +#include - xmm8 = _mm256_add_epi32(xmm8, xmm10); +static inline void volk_32fc_index_max_32u_a_avx2_variant_1(uint32_t* target, + lv_32fc_t* src0, + uint32_t num_points) +{ + const __m256i indices_increment = _mm256_set1_epi32(8); + /* + * At the start of each loop iteration current_indices holds the indices of + * the complex numbers loaded from memory. Explanation for odd order is given + * in implementation of vector_32fc_index_max_variant0(). 
+ */ + __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + + __m256 max_values = _mm256_setzero_ps(); + __m256i max_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { + __m256 in0 = _mm256_load_ps((float*)src0); + __m256 in1 = _mm256_load_ps((float*)(src0 + 4)); + vector_32fc_index_max_variant1( + in0, in1, &max_values, &max_indices, ¤t_indices, indices_increment); + src0 += 8; } - _mm256_store_ps((float*)&(holderf.f), xmm3); - _mm256_store_si256(&(holderi.int_vec), xmm9); + // determine maximum value and index in the result of the vectorized loop + __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8]; + __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8]; + _mm256_store_ps(max_values_buffer, max_values); + _mm256_store_si256((__m256i*)max_indices_buffer, max_indices); - for (i = 0; i < 8; i++) { - if (holderf.f[i] > max) { - index = holderi.i[i]; - max = holderf.f[i]; + float max = 0.f; + uint32_t index = 0; + for (unsigned i = 0; i < 8; i++) { + if (max_values_buffer[i] > max) { + max = max_values_buffer[i]; + index = max_indices_buffer[i]; } } - for (i = bound * 8; i < num_points; i++, src0++) { - sq_dist = - lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); - - if (sq_dist > max) { + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = + lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + if (abs_squared > max) { + max = abs_squared; index = i; - max = sq_dist; } + ++src0; } - target[0] = index; + + *target = index; } #endif /*LV_HAVE_AVX2*/ @@ -311,81 +346,116 @@ volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_ #ifdef LV_HAVE_AVX2 #include +#include -static inline void -volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) +static inline void volk_32fc_index_max_32u_u_avx2_variant_0(uint32_t* target, + lv_32fc_t* src0, + uint32_t num_points) { - const uint32_t num_bytes = num_points * 8; - - union bit256 holderf; - union bit256 holderi; - float sq_dist = 0.0; - float max = 0.0; - uint32_t index = 0; - - union bit256 xmm5, xmm4; - __m256 xmm1, xmm2, xmm3; - __m256i xmm8, xmm11, xmm12, xmm9, xmm10; - - xmm5.int_vec = _mm256_setzero_si256(); - xmm4.int_vec = _mm256_setzero_si256(); - holderf.int_vec = _mm256_setzero_si256(); - holderi.int_vec = _mm256_setzero_si256(); - - int bound = num_bytes >> 6; - int i = 0; - - xmm8 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); - xmm9 = _mm256_setzero_si256(); - xmm10 = _mm256_set1_epi32(8); - xmm3 = _mm256_setzero_ps(); - __m256i idx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7); - - for (; i < bound; ++i) { - xmm1 = _mm256_loadu_ps((float*)src0); - xmm2 = _mm256_loadu_ps((float*)&src0[4]); - + const __m256i indices_increment = _mm256_set1_epi32(8); + /* + * At the start of each loop iteration current_indices holds the indices of + * the complex numbers loaded from memory. Explanation for odd order is given + * in implementation of vector_32fc_index_max_variant0(). 
+ */ + __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + + __m256 max_values = _mm256_setzero_ps(); + __m256i max_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { + __m256 in0 = _mm256_loadu_ps((float*)src0); + __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4)); + vector_32fc_index_max_variant0( + in0, in1, &max_values, &max_indices, ¤t_indices, indices_increment); src0 += 8; + } - xmm1 = _mm256_mul_ps(xmm1, xmm1); - xmm2 = _mm256_mul_ps(xmm2, xmm2); + // determine maximum value and index in the result of the vectorized loop + __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8]; + __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8]; + _mm256_store_ps(max_values_buffer, max_values); + _mm256_store_si256((__m256i*)max_indices_buffer, max_indices); - xmm1 = _mm256_hadd_ps(xmm1, xmm2); - xmm1 = _mm256_permutevar8x32_ps(xmm1, idx); + float max = 0.f; + uint32_t index = 0; + for (unsigned i = 0; i < 8; i++) { + if (max_values_buffer[i] > max) { + max = max_values_buffer[i]; + index = max_indices_buffer[i]; + } + } - xmm3 = _mm256_max_ps(xmm1, xmm3); + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = + lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + if (abs_squared > max) { + max = abs_squared; + index = i; + } + ++src0; + } - xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS); - xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ); + *target = index; +} - xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec); - xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec); +#endif /*LV_HAVE_AVX2*/ - xmm9 = _mm256_add_epi32(xmm11, xmm12); +#ifdef LV_HAVE_AVX2 +#include +#include - xmm8 = _mm256_add_epi32(xmm8, xmm10); +static inline void volk_32fc_index_max_32u_u_avx2_variant_1(uint32_t* target, + lv_32fc_t* src0, + uint32_t num_points) +{ + const __m256i indices_increment = _mm256_set1_epi32(8); + /* + * At the start of each loop iteration current_indices holds the indices of + * the complex numbers loaded from memory. Explanation for odd order is given + * in implementation of vector_32fc_index_max_variant0(). 
+ */ + __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + + __m256 max_values = _mm256_setzero_ps(); + __m256i max_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { + __m256 in0 = _mm256_loadu_ps((float*)src0); + __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4)); + vector_32fc_index_max_variant1( + in0, in1, &max_values, &max_indices, ¤t_indices, indices_increment); + src0 += 8; } - _mm256_storeu_ps((float*)&(holderf.f), xmm3); - _mm256_storeu_si256(&(holderi.int_vec), xmm9); + // determine maximum value and index in the result of the vectorized loop + __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8]; + __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8]; + _mm256_store_ps(max_values_buffer, max_values); + _mm256_store_si256((__m256i*)max_indices_buffer, max_indices); - for (i = 0; i < 8; i++) { - if (holderf.f[i] > max) { - index = holderi.i[i]; - max = holderf.f[i]; + float max = 0.f; + uint32_t index = 0; + for (unsigned i = 0; i < 8; i++) { + if (max_values_buffer[i] > max) { + max = max_values_buffer[i]; + index = max_indices_buffer[i]; } } - for (i = bound * 8; i < num_points; i++, src0++) { - sq_dist = - lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); - - if (sq_dist > max) { + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = + lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); + if (abs_squared > max) { + max = abs_squared; index = i; - max = sq_dist; } + ++src0; } - target[0] = index; + + *target = index; } #endif /*LV_HAVE_AVX2*/
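
Reviewer note (not part of the patch): the lane order produced by _mm256_hadd_ps is the one subtle point in the new intrinsics, so a small standalone check can confirm that _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0) is the right initial value for current_indices. The sketch below is illustrative only; the file name and build line (e.g. gcc -mavx2 -O2 hadd_order_demo.c) are assumptions, not part of VOLK.

/* hadd_order_demo.c - prints which complex sample each hadd_ps lane holds. */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    /* Complex sample k is (k + 0j), so its squared magnitude is k*k and each
     * lane of the hadd result can be identified by eye. */
    float data[16];
    for (int k = 0; k < 8; ++k) {
        data[2 * k] = (float)k; /* real part */
        data[2 * k + 1] = 0.f;  /* imaginary part */
    }

    __m256 in0 = _mm256_loadu_ps(data);     /* samples 0..3 */
    __m256 in1 = _mm256_loadu_ps(data + 8); /* samples 4..7 */
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);

    float lanes[8];
    _mm256_storeu_ps(lanes, abs_squared);
    /* Prints 0 1 16 25 4 9 36 49, i.e. the squared magnitudes of samples
     * 0, 1, 4, 5, 2, 3, 6, 7 - exactly the order hard-coded in current_indices. */
    for (int lane = 0; lane < 8; ++lane) {
        printf("lane %d: %.0f\n", lane, lanes[lane]);
    }
    return 0;
}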