Skip to content

Commit

Permalink
Revert "Horner's rule for polynomial evaluation with symmetry idea di…
Browse files Browse the repository at this point in the history
…ccussed …"

This reverts commit 4ea0096.
  • Loading branch information
lu1and10 authored Jul 18, 2024
1 parent 4ea0096 commit e30a3fa
Showing 1 changed file with 8 additions and 77 deletions.
85 changes: 8 additions & 77 deletions src/spreadinterp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ namespace { // anonymous namespace for internal structs equivalent to declaring
// static
struct zip_low;
struct zip_hi;
template<unsigned cap> struct reverse_index;
template<unsigned cap> struct shuffle_index;
struct select_even;
struct select_odd;
// forward declaration to clean up the code and be able to use this everywhere in the file
Expand Down Expand Up @@ -779,80 +777,23 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
const FLT z = std::fma(FLT(2.0), x, FLT(w - 1)); // scale so local grid offset z in
// [-1,1]
if (opts.upsampfac == 2.0) { // floating point equality is fine here
using arch_t = typename simd_type::arch_type;
static constexpr auto alignment = arch_t::alignment();
static constexpr auto alignment = simd_type::arch_type::alignment();
static constexpr auto simd_size = simd_type::size;
static constexpr auto padded_ns = (w + simd_size - 1) & ~(simd_size - 1);
static constexpr auto nc = nc200<w>();
static constexpr auto horner_coeffs = get_horner_coeffs_200<FLT, w>();
static constexpr auto use_ker_sym = (simd_size < w);

alignas(alignment) static constexpr auto padded_coeffs =
pad_2D_array_with_zeros<FLT, nc, w, padded_ns>(horner_coeffs);

// use kernel symmetry trick if w > simd_size
if constexpr (use_ker_sym) {
static constexpr uint8_t tail = w % simd_size;
static constexpr uint8_t if_odd_degree = ((nc + 1) % 2);
static constexpr uint8_t offset_start = tail ? w - tail : w - simd_size;
static constexpr uint8_t end_idx = (w + (tail > 0)) / 2;
const simd_type zv{z};
const auto z2v = zv * zv;

// some xsimd constant for shuffle or inverse
static constexpr auto shuffle_batch = []() constexpr noexcept {
if constexpr (tail) {
return xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t,
shuffle_index<tail>>();
} else {
return xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t,
reverse_index<simd_size>>();
}
}();

// process simd vecs
simd_type k_prev, k_sym{0};
for (uint8_t i{0}, offset = offset_start; i < end_idx;
i += simd_size, offset -= simd_size) {
auto k_odd = [i]() constexpr noexcept {
if constexpr (if_odd_degree) {
return simd_type::load_aligned(padded_coeffs[0].data() + i);
} else {
return simd_type{0};
}
}();
auto k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i);
for (uint8_t j{1 + if_odd_degree}; j < nc; j += 2) {
const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i);
const auto cji_even = simd_type::load_aligned(padded_coeffs[j + 1].data() + i);
k_odd = xsimd::fma(k_odd, z2v, cji_odd);
k_even = xsimd::fma(k_even, z2v, cji_even);
}
// left part
xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i);
// right part symmetric to the left part
if (offset >= end_idx) {
if constexpr (tail) {
// to use aligned store, we need shuffle the previous k_sym and current k_sym
k_prev = k_sym;
k_sym = xsimd::fnma(k_odd, zv, k_even);
xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset);
} else {
xsimd::swizzle(xsimd::fnma(k_odd, zv, k_even), shuffle_batch)
.store_aligned(ker + offset);
}
}
}
} else {
const simd_type zv(z);
for (uint8_t i = 0; i < w; i += simd_size) {
auto k = simd_type::load_aligned(padded_coeffs[0].data() + i);
for (uint8_t j = 1; j < nc; ++j) {
const auto cji = simd_type::load_aligned(padded_coeffs[j].data() + i);
k = xsimd::fma(k, zv, cji);
}
k.store_aligned(ker + i);
const simd_type zv(z);
for (uint8_t i = 0; i < w; i += simd_size) {
auto k = simd_type::load_aligned(padded_coeffs[0].data() + i);
for (uint8_t j = 1; j < nc; ++j) {
const auto cji = simd_type::load_aligned(padded_coeffs[j].data() + i);
k = xsimd::fma(k, zv, cji);
}
k.store_aligned(ker + i);
}
return;
}
Expand Down Expand Up @@ -2227,16 +2168,6 @@ struct zip_hi {
return (size + index) / 2;
}
};
template<unsigned cap> struct reverse_index {
static constexpr unsigned get(unsigned index, const unsigned size) {
return index < cap ? (cap - 1 - index) : index;
}
};
template<unsigned cap> struct shuffle_index {
static constexpr unsigned get(unsigned index, const unsigned size) {
return index < cap ? (cap - 1 - index) : size + size + cap - 1 - index;
}
};

struct select_even {
static constexpr unsigned get(unsigned index, unsigned /*size*/) { return index * 2; }
Expand Down

0 comments on commit e30a3fa

Please sign in to comment.