-
Notifications
You must be signed in to change notification settings - Fork 81
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Horner's rule for polynomial evaluation with symmetry idea discussed in discussions #461 #477
Changes from 13 commits
026d8d3
da5d950
fb6dc75
dd54cc7
1ae8821
26be4b4
52bf0e6
4d25c51
dd5dd2a
a7bb9b4
c28ca33
eefac07
7fe3c99
c9fded5
d04ffcf
6dcf55d
2f1f13f
9d9b64c
5a68272
8747df5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,6 +24,8 @@ namespace { // anonymous namespace for internal structs equivalent to declaring | |
// static | ||
struct zip_low; | ||
struct zip_hi; | ||
template<unsigned cap> struct reverse_index; | ||
template<unsigned cap> struct shuffle_index; | ||
struct select_even; | ||
struct select_odd; | ||
// forward declaration to clean up the code and be able to use this everywhere in the file | ||
|
@@ -777,24 +779,99 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ | |
const FLT z = std::fma(FLT(2.0), x, FLT(w - 1)); // scale so local grid offset z in | ||
// [-1,1] | ||
if (opts.upsampfac == 2.0) { // floating point equality is fine here | ||
static constexpr auto alignment = simd_type::arch_type::alignment(); | ||
using arch_t = typename simd_type::arch_type; | ||
static constexpr auto alignment = arch_t::alignment(); | ||
static constexpr auto simd_size = simd_type::size; | ||
static constexpr auto padded_ns = (w + simd_size - 1) & ~(simd_size - 1); | ||
static constexpr auto nc = nc200<w>(); | ||
static constexpr auto horner_coeffs = get_horner_coeffs_200<FLT, w>(); | ||
static constexpr auto use_ker_sym = (simd_size < w); | ||
|
||
alignas(alignment) static constexpr auto padded_coeffs = | ||
pad_2D_array_with_zeros<FLT, nc, w, padded_ns>(horner_coeffs); | ||
|
||
const simd_type zv(z); | ||
for (uint8_t i = 0; i < w; i += simd_size) { | ||
auto k = simd_type::load_aligned(padded_coeffs[0].data() + i); | ||
for (uint8_t j = 1; j < nc; ++j) { | ||
const auto cji = simd_type::load_aligned(padded_coeffs[j].data() + i); | ||
k = xsimd::fma(k, zv, cji); | ||
// use kernel symmetry trick if w > simd_size | ||
if constexpr (use_ker_sym) { | ||
static constexpr uint8_t tail = w % simd_size; | ||
static constexpr uint8_t if_odd_degree = ((nc + 1) % 2); | ||
static const simd_type zerov(0.0); | ||
const simd_type zv(z); | ||
const simd_type z2v = zv * zv; | ||
|
||
// no xsimd::shuffle needed if tail is zero | ||
if constexpr (tail) { | ||
// some xsimd constant for shuffle | ||
static constexpr auto shuffle_batch = | ||
xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t, | ||
shuffle_index<tail>>(); | ||
|
||
// process simd vecs | ||
simd_type k_odd, k_even, k_prev, k_sym = zerov; | ||
for (uint8_t i = 0, offset = w - tail; i < (w + 1) / 2; | ||
i += simd_size, offset -= simd_size) { | ||
k_odd = if_odd_degree ? simd_type::load_aligned(padded_coeffs[0].data() + i) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this should be: k_odd = [i]() constexpr noexcept{
if constexpr(if_odd_degree){
return simd_type::load_aligned(padded_coeffs[0].data() + i);
} else return simd_type{0};
}(); |
||
: zerov; | ||
k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i); | ||
for (uint8_t j = 1 + if_odd_degree; j < nc; j += 2) { | ||
const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i); | ||
k_odd = xsimd::fma(k_odd, z2v, cji_odd); | ||
const auto cji_even = | ||
simd_type::load_aligned(padded_coeffs[j + 1].data() + i); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is [j+1] safe here? The loop runs up to nc; should it not run up to (nc-1) or (nc&~1)? |
||
k_even = xsimd::fma(k_even, z2v, cji_even); | ||
} | ||
// left part | ||
xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i); | ||
// right part symmetric to the left part | ||
if (offset >= (w + 1) / 2) { | ||
// to use aligned store, we need to shuffle the previous k_sym and current k_sym | ||
k_prev = k_sym; | ||
k_sym = xsimd::fma(k_odd, -zv, k_even); | ||
lu1and10 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset); | ||
} | ||
} | ||
} else { | ||
// xsimd constants for reverse | ||
static constexpr auto reverse_batch = | ||
xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t, | ||
reverse_index<simd_size>>(); | ||
|
||
// process simd vecs | ||
for (uint8_t i = 0, offset = w - simd_size; i < w / 2; | ||
i += simd_size, offset -= simd_size) { | ||
auto k_odd = if_odd_degree | ||
? simd_type::load_aligned(padded_coeffs[0].data() + i) | ||
: zerov; | ||
auto k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i); | ||
for (uint8_t j = 1 + if_odd_degree; j < nc; j += 2) { | ||
const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i); | ||
k_odd = xsimd::fma(k_odd, z2v, cji_odd); | ||
const auto cji_even = | ||
simd_type::load_aligned(padded_coeffs[j + 1].data() + i); | ||
k_even = xsimd::fma(k_even, z2v, cji_even); | ||
} | ||
// left part | ||
xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i); | ||
// right part symmetric to the left part | ||
if (offset >= w / 2) { | ||
// reverse the order for symmetric part | ||
xsimd::swizzle(xsimd::fma(k_odd, -zv, k_even), reverse_batch) | ||
.store_aligned(ker + offset); | ||
} | ||
} | ||
} | ||
} else { | ||
const simd_type zv(z); | ||
|
||
for (uint8_t i = 0; i < w; i += simd_size) { | ||
auto k = simd_type::load_aligned(padded_coeffs[0].data() + i); | ||
for (uint8_t j = 1; j < nc; ++j) { | ||
const auto cji = simd_type::load_aligned(padded_coeffs[j].data() + i); | ||
k = xsimd::fma(k, zv, cji); | ||
} | ||
k.store_aligned(ker + i); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. there is quite a lot of repetition I think you could do something like w/2 + (tail>0) to merge most of the code. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you give more of a hint what you're suggesting here? Or maybe Libin sees it... |
||
} | ||
k.store_aligned(ker + i); | ||
} | ||
|
||
return; | ||
} | ||
// insert the auto-generated code which expects z, w args, writes to ker... | ||
|
@@ -2168,6 +2245,16 @@ struct zip_hi { | |
return (size + index) / 2; | ||
} | ||
}; | ||
template<unsigned cap> struct reverse_index { | ||
static constexpr unsigned get(unsigned index, const unsigned size) { | ||
return index < cap ? (cap - 1 - index) : index; | ||
} | ||
}; | ||
template<unsigned cap> struct shuffle_index { | ||
static constexpr unsigned get(unsigned index, const unsigned size) { | ||
return index < cap ? (cap - 1 - index) : size + size + cap - 1 - index; | ||
} | ||
}; | ||
|
||
struct select_even { | ||
static constexpr unsigned get(unsigned index, unsigned /*size*/) { return index * 2; } | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
in my experiments returning {0} is often faster than saving it as a static const