From 026d8d39c901f7c611797e935ce4380e4a99cee1 Mon Sep 17 00:00:00 2001 From: Libin Lu Date: Tue, 2 Jul 2024 14:42:32 -0400 Subject: [PATCH 01/13] test kernel sym with aligned store --- src/spreadinterp.cpp | 128 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 117 insertions(+), 11 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 3212e6705..a47f0582b 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -24,6 +24,14 @@ namespace { // anonymous namespace for internal structs equivalent to declaring // static struct zip_low; struct zip_hi; +template +struct reverse_index; +template +struct select_index; +template +struct reverse_index_tail; +template +struct shuffle_index; // forward declaration to clean up the code and be able to use this everywhere in the file template static constexpr auto BestSIMDHelper(); template constexpr auto GetPaddedSIMDWidth(); @@ -521,15 +529,15 @@ FINUFFT_NEVER_INLINE static int interpSorted_kernel( if (!(opts.flags & TF_OMIT_SPREADING)) { switch (ndims) { case 1: - ker_eval(kernel_values.data(), opts, x1); + ker_eval(kernel_values.data(), opts, x1); interp_line(target, data_uniform, ker1, i1, N1); break; case 2: - ker_eval(kernel_values.data(), opts, x1, x2); + ker_eval(kernel_values.data(), opts, x1, x2); interp_square(target, data_uniform, ker1, ker2, i1, i2, N1, N2); break; case 3: - ker_eval(kernel_values.data(), opts, x1, x2, x3); + ker_eval(kernel_values.data(), opts, x1, x2, x3); interp_cube(target, data_uniform, ker1, ker2, ker3, i1, i2, i3, N1, N2, N3); break; @@ -760,25 +768,99 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ const FLT z = std::fma(FLT(2.0), x, FLT(w - 1)); // scale so local grid offset z in // [-1,1] if (opts.upsampfac == 2.0) { // floating point equality is fine here - static constexpr auto alignment = simd_type::arch_type::alignment(); + using arch_t = typename simd_type::arch_type; + static constexpr auto alignment = arch_t::alignment(); static constexpr auto simd_size = simd_type::size; static constexpr auto padded_ns = (w + simd_size - 1) & ~(simd_size - 1); static constexpr auto nc = nc200(); static constexpr auto horner_coeffs = get_horner_coeffs_200(); + static constexpr auto use_ker_sym = (simd_size < w); alignas(alignment) static constexpr auto padded_coeffs = pad_2D_array_with_zeros(horner_coeffs); - const simd_type zv(z); + // use kernel symmetry trick if w > simd_size + if constexpr (use_ker_sym) { + static constexpr uint8_t tail = w % simd_size; + static constexpr uint8_t if_odd_degree = ((nc+1) % 2); + static const simd_type zerov(0.0); + const simd_type zv(z); + const simd_type z2v = zv * zv; + + // no xsimd::select neeeded if tail is zero + if constexpr (tail) { + // some xsimd constants for shuffle + //static constexpr auto reverse_batch_head = xsimd::make_batch_constant, arch_t, reverse_index>(); + //static constexpr auto reverse_batch_tail = xsimd::make_batch_constant, arch_t, reverse_index_tail>(); + static constexpr auto shuffle_batch = xsimd::make_batch_constant, arch_t, shuffle_index>(); + //static constexpr auto select_batch = xsimd::make_batch_bool_constant>(); + + // process simd vecs + simd_type k_odd, k_even, k_prev, k_sym = zerov; + for (uint8_t i = 0, offset = w - tail; i < (w+1)/2; i += simd_size, offset -= simd_size) { + k_odd = if_odd_degree ? 
simd_type::load_aligned(padded_coeffs[0].data() + i) : zerov; + k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i); + for (uint8_t j = 1+if_odd_degree; j < nc; j += 2) { + const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i); + k_odd = xsimd::fma(k_odd, z2v, cji_odd); + const auto cji_even = simd_type::load_aligned(padded_coeffs[j+1].data() + i); + k_even = xsimd::fma(k_even, z2v, cji_even); + } + xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i); + if (offset >= (w+1)/2) { + k_prev = k_sym; + k_sym = xsimd::fma(k_odd, -zv, k_even); + xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset); + /* + if (i==0) { + // save one xsimd::swizzle for the first iteration(k_prev is zerov) + // by assumption, ker is padded to be multiple of simd_size + // the padded part must be zero because in spread_subproblem_*d_kernel, trg has out of bound writes. + xsimd::select(select_batch, xsimd::swizzle(k_sym, reverse_batch_head), zerov).store_aligned(ker + offset); + } + else { + // xsimd::select of two xsimd::swizzle is the xsimd::shuffle for the general shuffle case + //xsimd::select(select_batch, xsimd::swizzle(k_sym, reverse_batch_head), xsimd::swizzle(k_prev, reverse_batch_tail)).store_aligned(ker + offset); + xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset); + } + */ + } + } + } + else { + // xsimd constants for reverse + static constexpr auto reverse_batch = xsimd::make_batch_constant, arch_t, reverse_index>(); + + // process simd vecs + for (uint8_t i = 0, offset = w - simd_size; i < w/2; i += simd_size, offset -= simd_size) { + auto k_odd = if_odd_degree ? simd_type::load_aligned(padded_coeffs[0].data() + i) : zerov; + auto k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i); + for (uint8_t j = 1+if_odd_degree; j < nc; j += 2) { + const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i); + k_odd = xsimd::fma(k_odd, z2v, cji_odd); + const auto cji_even = simd_type::load_aligned(padded_coeffs[j+1].data() + i); + k_even = xsimd::fma(k_even, z2v, cji_even); + } + xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i); + if(offset >= w/2) { + xsimd::swizzle(xsimd::fma(k_odd, -zv, k_even), reverse_batch).store_aligned(ker + offset); + } + } + } + } + else { + const simd_type zv(z); - for (uint8_t i = 0; i < w; i += simd_size) { - auto k = simd_type::load_aligned(padded_coeffs[0].data() + i); - for (uint8_t j = 1; j < nc; ++j) { - const auto cji = simd_type::load_aligned(padded_coeffs[j].data() + i); - k = xsimd::fma(k, zv, cji); + for (uint8_t i = 0; i < w; i += simd_size) { + auto k = simd_type::load_aligned(padded_coeffs[0].data() + i); + for (uint8_t j = 1; j < nc; ++j) { + const auto cji = simd_type::load_aligned(padded_coeffs[j].data() + i); + k = xsimd::fma(k, zv, cji); + } + k.store_aligned(ker + i); } - k.store_aligned(ker + i); } + return; } // insert the auto-generated code which expects z, w args, writes to ker... @@ -1928,6 +2010,30 @@ struct zip_hi { return (size + index) / 2; } }; +template +struct reverse_index { + static constexpr unsigned get(unsigned index, const unsigned size) { + return index < cap ? (cap - 1 - index) : index; + } +}; +template +struct select_index { + static constexpr bool get(unsigned index, const unsigned size) { + return index < cap ? 1 : 0; + } +}; +template +struct reverse_index_tail { + static constexpr unsigned get(unsigned index, const unsigned size) { + return index < cap ? 
index : size + cap - 1 - index; + } +}; +template +struct shuffle_index { + static constexpr unsigned get(unsigned index, const unsigned size) { + return index < cap ? (cap - 1 - index) : size + size + cap - 1 - index; + } +}; void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset3, BIGINT padded_size1, BIGINT size1, BIGINT size2, BIGINT size3, From 1ae88211535b626d913f04ee697b2c7ed4937456 Mon Sep 17 00:00:00 2001 From: Libin Lu Date: Wed, 3 Jul 2024 16:30:38 -0400 Subject: [PATCH 02/13] add Horner sym eval without explicit aligned store(Martin does this in ducc) --- src/spreadinterp.cpp | 66 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index ac6bc2bc1..34b3cb7e2 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -75,6 +75,8 @@ template()>> // aka ns static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner( FLT *FINUFFT_RESTRICT ker, FLT x, const finufft_spread_opts &opts) noexcept; +static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner_unaligned_store( + FLT *FINUFFT_RESTRICT ker, FLT x, const finufft_spread_opts &opts) noexcept; template static void interp_line(FLT *FINUFFT_RESTRICT out, const FLT *du, const FLT *ker, BIGINT i1, BIGINT N1); @@ -762,6 +764,66 @@ void evaluate_kernel_vector(FLT *ker, FLT *args, const finufft_spread_opts &opts if (abs(args[i]) >= (FLT)opts.ES_halfwidth) ker[i] = 0.0; } +template // aka ns +void eval_kernel_vec_Horner_unaligned_store(FLT *FINUFFT_RESTRICT ker, const FLT x, + const finufft_spread_opts &opts) noexcept +/* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at +x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns. +This is the current evaluation method, since it's faster (except i7 w=16). +Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ + +{ + const FLT z = std::fma(FLT(2.0), x, FLT(w - 1)); // scale so local grid offset z in + // [-1,1] + if (opts.upsampfac == 2.0) { // floating point equality is fine here + static constexpr auto alignment = simd_type::arch_type::alignment(); + static constexpr auto simd_size = simd_type::size; + static constexpr auto padded_ns = (w + simd_size - 1) & ~(simd_size - 1); + static constexpr auto nc = nc200(); + static constexpr auto horner_coeffs = get_horner_coeffs_200(); + + alignas(alignment) static constexpr auto padded_coeffs = + pad_2D_array_with_zeros(horner_coeffs); + + static constexpr uint8_t nvec = (w+simd_size-1)/simd_size; + static constexpr uint8_t nvec_eval = (nvec+1)/2; + static constexpr uint8_t n_eval = simd_size*nvec_eval; + static constexpr uint8_t if_odd_degree = ((nc+1) % 2); + static const simd_type zerov(0.0); + const simd_type zv(z); + const simd_type z2v = zv * zv; + alignas(alignment) std::array sym_{}; + + // process simd vecs + for (uint8_t i = 0; i < n_eval; i += simd_size) { + auto k_odd = if_odd_degree ? 
simd_type::load_aligned(padded_coeffs[0].data() + i) : zerov; + auto k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i); + for (uint8_t j = 1+if_odd_degree; j < nc; j += 2) { + const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i); + k_odd = xsimd::fma(k_odd, z2v, cji_odd); + const auto cji_even = simd_type::load_aligned(padded_coeffs[j+1].data() + i); + k_even = xsimd::fma(k_even, z2v, cji_even); + } + + // left + xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i); + + // right + xsimd::fma(k_odd, -zv, k_even).store_aligned(sym_.data()); + // let compiler optimize the store, probably unaligned? + for (uint8_t j=0, j2=w-1-i; (j=n_eval); ++j,--j2) { + ker[j2] = sym_[j]; + } + } + return; + } + // insert the auto-generated code which expects z, w args, writes to ker... + if (opts.upsampfac == 1.25) { +#include "ker_lowupsampfac_horner_allw_loop_constexpr.c" + return; + } +} + template // aka ns void eval_kernel_vec_Horner(FLT *FINUFFT_RESTRICT ker, const FLT x, const finufft_spread_opts &opts) noexcept @@ -798,8 +860,8 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ // some xsimd constants for shuffle //static constexpr auto reverse_batch_head = xsimd::make_batch_constant, arch_t, reverse_index>(); //static constexpr auto reverse_batch_tail = xsimd::make_batch_constant, arch_t, reverse_index_tail>(); - static constexpr auto shuffle_batch = xsimd::make_batch_constant, arch_t, shuffle_index>(); //static constexpr auto select_batch = xsimd::make_batch_bool_constant>(); + static constexpr auto shuffle_batch = xsimd::make_batch_constant, arch_t, shuffle_index>(); // process simd vecs simd_type k_odd, k_even, k_prev, k_sym = zerov; @@ -818,6 +880,8 @@ Two upsampfacs implemented. Params must match ref formula. 
Barnett 4/24/18 */ k_sym = xsimd::fma(k_odd, -zv, k_even); xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset); /* + // the following is the equivalent code for the shuffle operation to avoid one swizzle in the first iteration + // seems not helping the performance if (i==0) { // save one xsimd::swizzle for the first iteration(k_prev is zerov) // by assumption, ker is padded to be multiple of simd_size From 4d25c5154aed4aabd82e2e80db664c1198cbdc9c Mon Sep 17 00:00:00 2001 From: Libin Lu Date: Tue, 9 Jul 2024 12:05:48 -0400 Subject: [PATCH 03/13] revert passing simd_type to ker_eval in interp, this is done in interp PR #471 --- src/spreadinterp.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 34b3cb7e2..fff411a43 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -531,15 +531,15 @@ FINUFFT_NEVER_INLINE static int interpSorted_kernel( if (!(opts.flags & TF_OMIT_SPREADING)) { switch (ndims) { case 1: - ker_eval(kernel_values.data(), opts, x1); + ker_eval(kernel_values.data(), opts, x1); interp_line(target, data_uniform, ker1, i1, N1); break; case 2: - ker_eval(kernel_values.data(), opts, x1, x2); + ker_eval(kernel_values.data(), opts, x1, x2); interp_square(target, data_uniform, ker1, ker2, i1, i2, N1, N2); break; case 3: - ker_eval(kernel_values.data(), opts, x1, x2, x3); + ker_eval(kernel_values.data(), opts, x1, x2, x3); interp_cube(target, data_uniform, ker1, ker2, ker3, i1, i2, i3, N1, N2, N3); break; From dd5dd2a40a5bf6f1a73f86442fd59419da1af4fb Mon Sep 17 00:00:00 2001 From: Libin Lu Date: Thu, 11 Jul 2024 12:44:46 -0400 Subject: [PATCH 04/13] clean up --- src/spreadinterp.cpp | 173 +++++++++++-------------------------------- 1 file changed, 42 insertions(+), 131 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index fff411a43..9f681c9f0 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -24,14 +24,8 @@ namespace { // anonymous namespace for internal structs equivalent to declaring // static struct zip_low; struct zip_hi; -template -struct reverse_index; -template -struct select_index; -template -struct reverse_index_tail; -template -struct shuffle_index; +template struct reverse_index; +template struct shuffle_index; // forward declaration to clean up the code and be able to use this everywhere in the file template static constexpr auto BestSIMDHelper(); template constexpr auto GetPaddedSIMDWidth(); @@ -752,8 +746,7 @@ void evaluate_kernel_vector(FLT *ker, FLT *args, const finufft_spread_opts &opts if (opts.kerpad) { // padded part should be zero, in spread_subproblem_nd_kernels, there are // out of bound writes to trg arrays - for (int i = N; i < Npad; ++i) - ker[i] = 0.0; + for (int i = N; i < Npad; ++i) ker[i] = 0.0; } } else { for (int i = 0; i < N; i++) // dummy for timing only @@ -764,66 +757,6 @@ void evaluate_kernel_vector(FLT *ker, FLT *args, const finufft_spread_opts &opts if (abs(args[i]) >= (FLT)opts.ES_halfwidth) ker[i] = 0.0; } -template // aka ns -void eval_kernel_vec_Horner_unaligned_store(FLT *FINUFFT_RESTRICT ker, const FLT x, - const finufft_spread_opts &opts) noexcept -/* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at -x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns. -This is the current evaluation method, since it's faster (except i7 w=16). -Two upsampfacs implemented. Params must match ref formula. 
Barnett 4/24/18 */ - -{ - const FLT z = std::fma(FLT(2.0), x, FLT(w - 1)); // scale so local grid offset z in - // [-1,1] - if (opts.upsampfac == 2.0) { // floating point equality is fine here - static constexpr auto alignment = simd_type::arch_type::alignment(); - static constexpr auto simd_size = simd_type::size; - static constexpr auto padded_ns = (w + simd_size - 1) & ~(simd_size - 1); - static constexpr auto nc = nc200(); - static constexpr auto horner_coeffs = get_horner_coeffs_200(); - - alignas(alignment) static constexpr auto padded_coeffs = - pad_2D_array_with_zeros(horner_coeffs); - - static constexpr uint8_t nvec = (w+simd_size-1)/simd_size; - static constexpr uint8_t nvec_eval = (nvec+1)/2; - static constexpr uint8_t n_eval = simd_size*nvec_eval; - static constexpr uint8_t if_odd_degree = ((nc+1) % 2); - static const simd_type zerov(0.0); - const simd_type zv(z); - const simd_type z2v = zv * zv; - alignas(alignment) std::array sym_{}; - - // process simd vecs - for (uint8_t i = 0; i < n_eval; i += simd_size) { - auto k_odd = if_odd_degree ? simd_type::load_aligned(padded_coeffs[0].data() + i) : zerov; - auto k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i); - for (uint8_t j = 1+if_odd_degree; j < nc; j += 2) { - const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i); - k_odd = xsimd::fma(k_odd, z2v, cji_odd); - const auto cji_even = simd_type::load_aligned(padded_coeffs[j+1].data() + i); - k_even = xsimd::fma(k_even, z2v, cji_even); - } - - // left - xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i); - - // right - xsimd::fma(k_odd, -zv, k_even).store_aligned(sym_.data()); - // let compiler optimize the store, probably unaligned? - for (uint8_t j=0, j2=w-1-i; (j=n_eval); ++j,--j2) { - ker[j2] = sym_[j]; - } - } - return; - } - // insert the auto-generated code which expects z, w args, writes to ker... - if (opts.upsampfac == 1.25) { -#include "ker_lowupsampfac_horner_allw_loop_constexpr.c" - return; - } -} - template // aka ns void eval_kernel_vec_Horner(FLT *FINUFFT_RESTRICT ker, const FLT x, const finufft_spread_opts &opts) noexcept @@ -849,76 +782,68 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ // use kernel symmetry trick if w > simd_size if constexpr (use_ker_sym) { - static constexpr uint8_t tail = w % simd_size; - static constexpr uint8_t if_odd_degree = ((nc+1) % 2); + static constexpr uint8_t tail = w % simd_size; + static constexpr uint8_t if_odd_degree = ((nc + 1) % 2); static const simd_type zerov(0.0); const simd_type zv(z); const simd_type z2v = zv * zv; // no xsimd::select neeeded if tail is zero if constexpr (tail) { - // some xsimd constants for shuffle - //static constexpr auto reverse_batch_head = xsimd::make_batch_constant, arch_t, reverse_index>(); - //static constexpr auto reverse_batch_tail = xsimd::make_batch_constant, arch_t, reverse_index_tail>(); - //static constexpr auto select_batch = xsimd::make_batch_bool_constant>(); - static constexpr auto shuffle_batch = xsimd::make_batch_constant, arch_t, shuffle_index>(); + // some xsimd constant for shuffle + static constexpr auto shuffle_batch = + xsimd::make_batch_constant, arch_t, + shuffle_index>(); // process simd vecs simd_type k_odd, k_even, k_prev, k_sym = zerov; - for (uint8_t i = 0, offset = w - tail; i < (w+1)/2; i += simd_size, offset -= simd_size) { - k_odd = if_odd_degree ? 
simd_type::load_aligned(padded_coeffs[0].data() + i) : zerov; + for (uint8_t i = 0, offset = w - tail; i < (w + 1) / 2; + i += simd_size, offset -= simd_size) { + k_odd = if_odd_degree ? simd_type::load_aligned(padded_coeffs[0].data() + i) + : zerov; k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i); - for (uint8_t j = 1+if_odd_degree; j < nc; j += 2) { - const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i); - k_odd = xsimd::fma(k_odd, z2v, cji_odd); - const auto cji_even = simd_type::load_aligned(padded_coeffs[j+1].data() + i); - k_even = xsimd::fma(k_even, z2v, cji_even); + for (uint8_t j = 1 + if_odd_degree; j < nc; j += 2) { + const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i); + k_odd = xsimd::fma(k_odd, z2v, cji_odd); + const auto cji_even = + simd_type::load_aligned(padded_coeffs[j + 1].data() + i); + k_even = xsimd::fma(k_even, z2v, cji_even); } xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i); - if (offset >= (w+1)/2) { + if (offset >= (w + 1) / 2) { k_prev = k_sym; - k_sym = xsimd::fma(k_odd, -zv, k_even); + k_sym = xsimd::fma(k_odd, -zv, k_even); xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset); - /* - // the following is the equivalent code for the shuffle operation to avoid one swizzle in the first iteration - // seems not helping the performance - if (i==0) { - // save one xsimd::swizzle for the first iteration(k_prev is zerov) - // by assumption, ker is padded to be multiple of simd_size - // the padded part must be zero because in spread_subproblem_*d_kernel, trg has out of bound writes. - xsimd::select(select_batch, xsimd::swizzle(k_sym, reverse_batch_head), zerov).store_aligned(ker + offset); - } - else { - // xsimd::select of two xsimd::swizzle is the xsimd::shuffle for the general shuffle case - //xsimd::select(select_batch, xsimd::swizzle(k_sym, reverse_batch_head), xsimd::swizzle(k_prev, reverse_batch_tail)).store_aligned(ker + offset); - xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset); - } - */ } } - } - else { + } else { // xsimd constants for reverse - static constexpr auto reverse_batch = xsimd::make_batch_constant, arch_t, reverse_index>(); + static constexpr auto reverse_batch = + xsimd::make_batch_constant, arch_t, + reverse_index>(); // process simd vecs - for (uint8_t i = 0, offset = w - simd_size; i < w/2; i += simd_size, offset -= simd_size) { - auto k_odd = if_odd_degree ? simd_type::load_aligned(padded_coeffs[0].data() + i) : zerov; + for (uint8_t i = 0, offset = w - simd_size; i < w / 2; + i += simd_size, offset -= simd_size) { + auto k_odd = if_odd_degree + ? 
simd_type::load_aligned(padded_coeffs[0].data() + i) + : zerov; auto k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i); - for (uint8_t j = 1+if_odd_degree; j < nc; j += 2) { - const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i); - k_odd = xsimd::fma(k_odd, z2v, cji_odd); - const auto cji_even = simd_type::load_aligned(padded_coeffs[j+1].data() + i); - k_even = xsimd::fma(k_even, z2v, cji_even); + for (uint8_t j = 1 + if_odd_degree; j < nc; j += 2) { + const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i); + k_odd = xsimd::fma(k_odd, z2v, cji_odd); + const auto cji_even = + simd_type::load_aligned(padded_coeffs[j + 1].data() + i); + k_even = xsimd::fma(k_even, z2v, cji_even); } xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i); - if(offset >= w/2) { - xsimd::swizzle(xsimd::fma(k_odd, -zv, k_even), reverse_batch).store_aligned(ker + offset); + if (offset >= w / 2) { + xsimd::swizzle(xsimd::fma(k_odd, -zv, k_even), reverse_batch) + .store_aligned(ker + offset); } } } - } - else { + } else { const simd_type zv(z); for (uint8_t i = 0; i < w; i += simd_size) { @@ -2080,26 +2005,12 @@ struct zip_hi { return (size + index) / 2; } }; -template -struct reverse_index { +template struct reverse_index { static constexpr unsigned get(unsigned index, const unsigned size) { return index < cap ? (cap - 1 - index) : index; } }; -template -struct select_index { - static constexpr bool get(unsigned index, const unsigned size) { - return index < cap ? 1 : 0; - } -}; -template -struct reverse_index_tail { - static constexpr unsigned get(unsigned index, const unsigned size) { - return index < cap ? index : size + cap - 1 - index; - } -}; -template -struct shuffle_index { +template struct shuffle_index { static constexpr unsigned get(unsigned index, const unsigned size) { return index < cap ? (cap - 1 - index) : size + size + cap - 1 - index; } From a7bb9b422751b5a28284834f7b3e2bd6a90af1c5 Mon Sep 17 00:00:00 2001 From: Libin Lu Date: Thu, 11 Jul 2024 12:50:51 -0400 Subject: [PATCH 05/13] removed unused declare --- src/spreadinterp.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 9f681c9f0..e049ab69a 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -69,8 +69,6 @@ template()>> // aka ns static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner( FLT *FINUFFT_RESTRICT ker, FLT x, const finufft_spread_opts &opts) noexcept; -static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner_unaligned_store( - FLT *FINUFFT_RESTRICT ker, FLT x, const finufft_spread_opts &opts) noexcept; template static void interp_line(FLT *FINUFFT_RESTRICT out, const FLT *du, const FLT *ker, BIGINT i1, BIGINT N1); From eefac071a891123dd3e762002c3c398cd66dd081 Mon Sep 17 00:00:00 2001 From: Libin Lu Date: Thu, 11 Jul 2024 13:05:32 -0400 Subject: [PATCH 06/13] add some comments --- src/spreadinterp.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index d858ebd60..ea3736593 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -798,7 +798,7 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ const simd_type zv(z); const simd_type z2v = zv * zv; - // no xsimd::select neeeded if tail is zero + // no xsimd::shuffle neeeded if tail is zero if constexpr (tail) { // some xsimd constant for shuffle static constexpr auto shuffle_batch = @@ -819,8 +819,11 @@ Two upsampfacs implemented. 
Params must match ref formula. Barnett 4/24/18 */ simd_type::load_aligned(padded_coeffs[j + 1].data() + i); k_even = xsimd::fma(k_even, z2v, cji_even); } + // left part xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i); + // right part symmetric to the left part if (offset >= (w + 1) / 2) { + // to use aligned store, we need shuffle the previous k_sym and current k_sym k_prev = k_sym; k_sym = xsimd::fma(k_odd, -zv, k_even); xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset); @@ -846,8 +849,11 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ simd_type::load_aligned(padded_coeffs[j + 1].data() + i); k_even = xsimd::fma(k_even, z2v, cji_even); } + // left part xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i); + // right part symmetric to the left part if (offset >= w / 2) { + // reverse the order for symmetric part xsimd::swizzle(xsimd::fma(k_odd, -zv, k_even), reverse_batch) .store_aligned(ker + offset); } From c9fded59530e0cac4961f8ae54e596371176c9bd Mon Sep 17 00:00:00 2001 From: Libin Lu Date: Fri, 12 Jul 2024 12:14:02 -0400 Subject: [PATCH 07/13] change to use fnma in sym part --- src/spreadinterp.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index ea3736593..38371d17a 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -825,7 +825,7 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ if (offset >= (w + 1) / 2) { // to use aligned store, we need shuffle the previous k_sym and current k_sym k_prev = k_sym; - k_sym = xsimd::fma(k_odd, -zv, k_even); + k_sym = xsimd::fnma(k_odd, zv, k_even); xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset); } } @@ -854,7 +854,7 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ // right part symmetric to the left part if (offset >= w / 2) { // reverse the order for symmetric part - xsimd::swizzle(xsimd::fma(k_odd, -zv, k_even), reverse_batch) + xsimd::swizzle(xsimd::fnma(k_odd, zv, k_even), reverse_batch) .store_aligned(ker + offset); } } From d04ffcfde66de28f8cd3c75066b1b42ae6358d2a Mon Sep 17 00:00:00 2001 From: Libin Lu Date: Tue, 16 Jul 2024 12:55:35 -0400 Subject: [PATCH 08/13] cleanup a bit, a bit slower --- src/spreadinterp.cpp | 90 ++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 53 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 38371d17a..b820af054 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -794,67 +794,51 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ if constexpr (use_ker_sym) { static constexpr uint8_t tail = w % simd_size; static constexpr uint8_t if_odd_degree = ((nc + 1) % 2); - static const simd_type zerov(0.0); + static constexpr uint8_t offset_start = tail ? w - tail : w - simd_size; + static constexpr uint8_t end_idx = (w + (tail > 0)) / 2; const simd_type zv(z); const simd_type z2v = zv * zv; - // no xsimd::shuffle neeeded if tail is zero - if constexpr (tail) { - // some xsimd constant for shuffle - static constexpr auto shuffle_batch = - xsimd::make_batch_constant, arch_t, - shuffle_index>(); - - // process simd vecs - simd_type k_odd, k_even, k_prev, k_sym = zerov; - for (uint8_t i = 0, offset = w - tail; i < (w + 1) / 2; - i += simd_size, offset -= simd_size) { - k_odd = if_odd_degree ? 
simd_type::load_aligned(padded_coeffs[0].data() + i) - : zerov; - k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i); - for (uint8_t j = 1 + if_odd_degree; j < nc; j += 2) { - const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i); - k_odd = xsimd::fma(k_odd, z2v, cji_odd); - const auto cji_even = - simd_type::load_aligned(padded_coeffs[j + 1].data() + i); - k_even = xsimd::fma(k_even, z2v, cji_even); + // some xsimd constant for shuffle or inverse + static constexpr auto shuffle_batch = []() constexpr noexcept { + if constexpr (tail) { + return xsimd::make_batch_constant, arch_t, + shuffle_index>(); + } else { + return xsimd::make_batch_constant, arch_t, + reverse_index>(); + } + }(); + + // process simd vecs + simd_type k_odd, k_even, k_prev, k_sym{0}; + for (uint8_t i = 0, offset = offset_start; i < end_idx; + i += simd_size, offset -= simd_size) { + k_odd = [i]() constexpr noexcept { + if constexpr (if_odd_degree) { + return simd_type::load_aligned(padded_coeffs[0].data() + i); + } else { + return simd_type{0}; } - // left part - xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i); - // right part symmetric to the left part - if (offset >= (w + 1) / 2) { + }(); + k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i); + for (uint8_t j = 1 + if_odd_degree; j < nc; j += 2) { + const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i); + k_odd = xsimd::fma(k_odd, z2v, cji_odd); + const auto cji_even = simd_type::load_aligned(padded_coeffs[j + 1].data() + i); + k_even = xsimd::fma(k_even, z2v, cji_even); + } + // left part + xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i); + // right part symmetric to the left part + if (offset >= end_idx) { + if constexpr (tail) { // to use aligned store, we need shuffle the previous k_sym and current k_sym k_prev = k_sym; k_sym = xsimd::fnma(k_odd, zv, k_even); xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset); - } - } - } else { - // xsimd constants for reverse - static constexpr auto reverse_batch = - xsimd::make_batch_constant, arch_t, - reverse_index>(); - - // process simd vecs - for (uint8_t i = 0, offset = w - simd_size; i < w / 2; - i += simd_size, offset -= simd_size) { - auto k_odd = if_odd_degree - ? 
simd_type::load_aligned(padded_coeffs[0].data() + i) - : zerov; - auto k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i); - for (uint8_t j = 1 + if_odd_degree; j < nc; j += 2) { - const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i); - k_odd = xsimd::fma(k_odd, z2v, cji_odd); - const auto cji_even = - simd_type::load_aligned(padded_coeffs[j + 1].data() + i); - k_even = xsimd::fma(k_even, z2v, cji_even); - } - // left part - xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i); - // right part symmetric to the left part - if (offset >= w / 2) { - // reverse the order for symmetric part - xsimd::swizzle(xsimd::fnma(k_odd, zv, k_even), reverse_batch) + } else { + xsimd::swizzle(xsimd::fnma(k_odd, zv, k_even), shuffle_batch) .store_aligned(ker + offset); } } From 6dcf55d544f0004bfbc7a5f0facb590b3184f0bf Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 16 Jul 2024 14:57:05 -0400 Subject: [PATCH 09/13] small fixes --- src/spreadinterp.cpp | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index b820af054..14a057062 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -796,8 +796,8 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ static constexpr uint8_t if_odd_degree = ((nc + 1) % 2); static constexpr uint8_t offset_start = tail ? w - tail : w - simd_size; static constexpr uint8_t end_idx = (w + (tail > 0)) / 2; - const simd_type zv(z); - const simd_type z2v = zv * zv; + const simd_type zv{z}; + const simd_type z2v{zv * zv}; // some xsimd constant for shuffle or inverse static constexpr auto shuffle_batch = []() constexpr noexcept { @@ -811,21 +811,24 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ }(); // process simd vecs - simd_type k_odd, k_even, k_prev, k_sym{0}; - for (uint8_t i = 0, offset = offset_start; i < end_idx; + struct EmptySimd {}; + // these exist only if tail > 0 + typename std::conditional<(tail > 0), simd_type, EmptySimd>::type k_prev, k_sym; + if constexpr (tail) k_sym = {0}; + for (uint8_t i{0}, offset = offset_start; i < end_idx; i += simd_size, offset -= simd_size) { - k_odd = [i]() constexpr noexcept { + auto k_odd = [i]() constexpr noexcept { if constexpr (if_odd_degree) { return simd_type::load_aligned(padded_coeffs[0].data() + i); } else { return simd_type{0}; } }(); - k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i); - for (uint8_t j = 1 + if_odd_degree; j < nc; j += 2) { + auto k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i); + for (uint8_t j{1 + if_odd_degree}; j < nc; j += 2) { const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i); - k_odd = xsimd::fma(k_odd, z2v, cji_odd); const auto cji_even = simd_type::load_aligned(padded_coeffs[j + 1].data() + i); + k_odd = xsimd::fma(k_odd, z2v, cji_odd); k_even = xsimd::fma(k_even, z2v, cji_even); } // left part @@ -845,7 +848,6 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ } } else { const simd_type zv(z); - for (uint8_t i = 0; i < w; i += simd_size) { auto k = simd_type::load_aligned(padded_coeffs[0].data() + i); for (uint8_t j = 1; j < nc; ++j) { @@ -855,7 +857,6 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ k.store_aligned(ker + i); } } - return; } // insert the auto-generated code which expects z, w args, writes to ker... 
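
Note on the kernel-symmetry evaluation used in patches 01-09: writing the degree-(nc-1) Horner polynomial as p(z) = E(z^2) + z*O(z^2) lets a single pass over the coefficients yield both p(z), the left half of the kernel, and its mirror p(-z) = E(z^2) - z*O(z^2), which is why the loops above keep separate k_even/k_odd accumulators and only iterate to roughly w/2. The scalar sketch below illustrates that split under simplifying assumptions (plain doubles instead of xsimd batches; the helper name eval_pair is invented for this note); it is not code from the patch.

// Scalar sketch of the even/odd Horner split (illustration only; the patch
// evaluates whole SIMD batches of coefficients rather than single doubles).
#include <array>
#include <cstddef>
#include <cstdio>
#include <utility>

// c[0] holds the highest-degree coefficient, as in padded_coeffs.
// Returns {p(z), p(-z)} for the degree NC-1 polynomial p.
template <std::size_t NC>
std::pair<double, double> eval_pair(const std::array<double, NC> &c, double z) {
  constexpr bool odd_degree = (NC % 2 == 0); // same role as if_odd_degree above
  double k_odd  = odd_degree ? c[0] : 0.0;   // accumulates O(z^2)
  double k_even = odd_degree ? c[1] : c[0];  // accumulates E(z^2)
  const double z2 = z * z;
  for (std::size_t j = odd_degree ? 2 : 1; j < NC; j += 2) {
    k_odd  = k_odd * z2 + c[j];              // xsimd::fma in the patch
    k_even = k_even * z2 + c[j + 1];
  }
  return {k_even + z * k_odd,  // p(+z): stored at ker + i
          k_even - z * k_odd}; // p(-z): the fnma value, stored mirrored at ker + offset
}

int main() {
  // toy example: p(z) = z^2 + 2z + 3 evaluated at z = 0.5
  const std::array<double, 3> c{1.0, 2.0, 3.0};
  const auto [left, right] = eval_pair(c, 0.5);
  std::printf("p(+z) = %.3f, p(-z) = %.3f\n", left, right); // 4.250, 2.250
}

The mirrored half therefore costs one extra fnma per batch; the remaining work in these patches is writing those mirrored values back into ker with aligned stores, which is what the reverse_index/shuffle_index batch constants take care of.
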
From 2f1f13fecc74bf6e0962b2d9121b96628eaec093 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 16 Jul 2024 17:02:23 -0400 Subject: [PATCH 10/13] fixed compile flags that was breaking clang code --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a3a61fba1..0bf33a035 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,7 +17,6 @@ if (CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS) -fno-trapping-math -fassociative-math -freciprocal-math - -fmerge-all-constants -ftree-vectorize ) # if -fimplicit-constexpr is supported, add it to the list of flags From 9d9b64c1bcc51bd1703975938606e7e06360b2cb Mon Sep 17 00:00:00 2001 From: Libin Lu Date: Wed, 17 Jul 2024 12:16:17 -0400 Subject: [PATCH 11/13] remove conditional declaration --- CMakeLists.txt | 1 + src/spreadinterp.cpp | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0bf33a035..a3a61fba1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,6 +17,7 @@ if (CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS) -fno-trapping-math -fassociative-math -freciprocal-math + -fmerge-all-constants -ftree-vectorize ) # if -fimplicit-constexpr is supported, add it to the list of flags diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 14a057062..169e09dbf 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -811,9 +811,7 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ }(); // process simd vecs - struct EmptySimd {}; - // these exist only if tail > 0 - typename std::conditional<(tail > 0), simd_type, EmptySimd>::type k_prev, k_sym; + simd_type k_prev, k_sym; if constexpr (tail) k_sym = {0}; for (uint8_t i{0}, offset = offset_start; i < end_idx; i += simd_size, offset -= simd_size) { From 5a6827273fbd6a262b1a4147c704432653f6064b Mon Sep 17 00:00:00 2001 From: Libin Lu Date: Wed, 17 Jul 2024 12:36:56 -0400 Subject: [PATCH 12/13] try to make -fmerge-all-constants work --- src/spreadinterp.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 169e09dbf..aff8112d4 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -796,8 +796,8 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ static constexpr uint8_t if_odd_degree = ((nc + 1) % 2); static constexpr uint8_t offset_start = tail ? w - tail : w - simd_size; static constexpr uint8_t end_idx = (w + (tail > 0)) / 2; - const simd_type zv{z}; - const simd_type z2v{zv * zv}; + const simd_type zv(z); + const simd_type z2v = zv * zv; // some xsimd constant for shuffle or inverse static constexpr auto shuffle_batch = []() constexpr noexcept { @@ -811,8 +811,7 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ }(); // process simd vecs - simd_type k_prev, k_sym; - if constexpr (tail) k_sym = {0}; + simd_type k_prev, k_sym{0}; for (uint8_t i{0}, offset = offset_start; i < end_idx; i += simd_size, offset -= simd_size) { auto k_odd = [i]() constexpr noexcept { From 8747df58a3f90494d39abec3392b6f071cc0a9e7 Mon Sep 17 00:00:00 2001 From: Libin Lu Date: Wed, 17 Jul 2024 14:16:15 -0400 Subject: [PATCH 13/13] use auto for z2v --- src/spreadinterp.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index aff8112d4..3f03c79bb 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -796,8 +796,8 @@ Two upsampfacs implemented. 
Params must match ref formula. Barnett 4/24/18 */ static constexpr uint8_t if_odd_degree = ((nc + 1) % 2); static constexpr uint8_t offset_start = tail ? w - tail : w - simd_size; static constexpr uint8_t end_idx = (w + (tail > 0)) / 2; - const simd_type zv(z); - const simd_type z2v = zv * zv; + const simd_type zv{z}; + const auto z2v = zv * zv; // some xsimd constant for shuffle or inverse static constexpr auto shuffle_batch = []() constexpr noexcept {
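
A closing note on the aligned-store index mappings: xsimd::shuffle(a, b, mask) selects from a for mask values below the batch size and from b otherwise, while xsimd::swizzle permutes a single batch. The standalone check below (assuming simd_size = 4 and tail = 2, values chosen purely for illustration; no xsimd needed) shows what reverse_index and shuffle_index, as kept after the patch 04 cleanup, actually produce.

#include <cstdint>

// Copies of the index functors from the patch, exercised at compile time.
template <uint8_t cap> struct reverse_index {
  static constexpr unsigned get(unsigned index, unsigned size) {
    return index < cap ? (cap - 1 - index) : index;
  }
};
template <uint8_t cap> struct shuffle_index {
  static constexpr unsigned get(unsigned index, unsigned size) {
    return index < cap ? (cap - 1 - index) : size + size + cap - 1 - index;
  }
};

// tail == 0 path: a plain in-batch reversal of the current fnma result,
// mask = {3, 2, 1, 0}.
static_assert(reverse_index<4>::get(0, 4) == 3, "");
static_assert(reverse_index<4>::get(3, 4) == 0, "");

// tail == 2 path: the first two lanes come reversed from the current k_sym
// (mask values < size), the remaining lanes come reversed from the previous
// k_sym (mask values >= size), i.e. mask = {1, 0, 7, 6}.
static_assert(shuffle_index<2>::get(0, 4) == 1, "");
static_assert(shuffle_index<2>::get(1, 4) == 0, "");
static_assert(shuffle_index<2>::get(2, 4) == 7, "");
static_assert(shuffle_index<2>::get(3, 4) == 6, "");

int main() { return 0; }

This is why the tail != 0 case has to carry k_prev: when w is not a multiple of simd_size, the mirrored values straddle two aligned slots of ker, so each aligned store combines part of the current symmetric batch with part of the previous one.
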