Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve encoder performance #19

Merged
merged 7 commits into from
Feb 6, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
182 changes: 111 additions & 71 deletions include/qoixx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,19 @@ class qoi{
};
struct rgb_t{
std::uint8_t r, g, b;
// Packs this pixel into a 32-bit word: r in bits 0-7, g in bits 8-15,
// b in bits 16-23, and a fixed 255 in bits 24-31.
inline std::uint32_t v()const{
  static_assert(sizeof(rgb_t) == 3u);
  constexpr std::uint32_t opaque_alpha = 255u << 24u;
  if constexpr(std::endian::native != std::endian::little)
    // Not little-endian: assemble the word arithmetically so the numeric
    // value does not depend on the in-memory byte order.
    return opaque_alpha |
           std::uint32_t{b} << 16 |
           std::uint32_t{g} << 8 |
           std::uint32_t{r};
  else{
    // Little-endian: overlay the three channel bytes onto the low bytes of
    // the pre-filled word.
    // NOTE(review): assumes efficient_memcpy<3> copies exactly 3 bytes from
    // `this` into `packed`; the helper is defined elsewhere - confirm.
    std::uint32_t packed = opaque_alpha;
    efficient_memcpy<3>(&packed, this);
    return packed;
  }
}
inline std::uint_fast32_t hash()const{
static constexpr std::uint64_t constant =
static_cast<std::uint64_t>(3u) << 56 |
Expand Down Expand Up @@ -302,12 +315,29 @@ class qoi{
}
}
private:
// Selects the pixel type by alpha presence: rgba_t when Alpha is true,
// rgb_t otherwise.
template<bool Alpha>
using local_rgba_pixel_t = std::conditional_t<Alpha, rgba_t, rgb_t>;
// Returns the initial pixel value for the selected pixel type:
// {0, 0, 0, 255} when Alpha is true, a value-initialized pixel otherwise.
template<bool Alpha>
static constexpr local_rgba_pixel_t<Alpha> default_pixel()noexcept{
  using pixel_type = local_rgba_pixel_t<Alpha>;
  if constexpr(not Alpha)
    return pixel_type{};
  else
    return pixel_type{0, 0, 0, 255};
}
// A pixel preceded by a one-byte tag, kept together in a single object.
// NOTE(review): placing the tag directly before the channels looks intended
// to let the tag and the colour bytes be emitted with one contiguous copy
// starting at the object's address - confirm against the push<4>(p, &px)
// call sites.
template<bool Alpha>
struct local_pixel{
// Tag byte; defaults to the chunk_tag::rgb value for both Alpha variants.
std::uint8_t rgb = static_cast<std::uint8_t>(chunk_tag::rgb);
// The pixel channels: rgba_t when Alpha is true, rgb_t otherwise.
local_rgba_pixel_t<Alpha> v;
};
// Both instantiations must have unique object representations, i.e. no padding
// bits anywhere in the tag + pixel layout.
static_assert(std::has_unique_object_representations_v<local_pixel<true>> and std::has_unique_object_representations_v<local_pixel<false>>);
template<std::uint_fast8_t Channels, typename Pusher, typename Puller>
static inline void encode_body(Pusher& p, Puller& pixels, rgba_t (&index)[index_size], std::size_t px_len, rgba_t px_prev = {0, 0, 0, 255}, std::uint8_t prev_hash = static_cast<std::uint8_t>(index_size), std::size_t run = 0){
const auto f = [&run, &index, &p, &prev_hash](rgba_t px, rgba_t px_prev){
if(px == px_prev){
static inline void encode_body(Pusher& p, Puller& pixels, rgba_t (&index)[index_size], std::size_t px_len, local_rgba_pixel_t<Channels == 4u> px_prev = default_pixel<Channels == 4u>(), std::uint8_t prev_hash = static_cast<std::uint8_t>(index_size), std::size_t run = 0){
local_pixel<Channels == 4u> px;
while(px_len--)[[likely]]{
pull<Channels>(&px.v, pixels);
if(px.v.v() == px_prev.v()){
++run;
return;
continue;
}
if(run > 0){
while(run >= 62)[[unlikely]]{
Expand All @@ -328,57 +358,55 @@ class qoi{
}
}

const auto index_pos = px.hash() % index_size;
const auto index_pos = px.v.hash() % index_size;
prev_hash = index_pos;

if(index[index_pos] == px){
p.push(chunk_tag::index | index_pos);
return;
}
index[index_pos] = px;

if constexpr(Channels == 4)
if(px.a != px_prev.a){
p.push(chunk_tag::rgba);
push<4>(p, &px);
return;
do{
if(index[index_pos].v() == px.v.v()){
p.push(chunk_tag::index | index_pos);
break;
}
const auto vr = static_cast<int>(px.r) - static_cast<int>(px_prev.r) + 2;
const auto vg = static_cast<int>(px.g) - static_cast<int>(px_prev.g) + 2;
const auto vb = static_cast<int>(px.b) - static_cast<int>(px_prev.b) + 2;
efficient_memcpy<Channels>(index + index_pos, &px.v);
if constexpr(Channels == 3)
index[index_pos].a = 255u;

if(const std::uint8_t v = vr|vg|vb; v < 4){
p.push(chunk_tag::diff | vr << 4 | vg << 2 | vb);
return;
}
const auto vg_r = vr - vg + 8;
const auto vg_b = vb - vg + 8;
if(const int v = vg_r|vg_b, g = vg+30; ((v&0xf0)|(g&0xc0)) == 0){
p.push(chunk_tag::luma | g);
p.push(vg_r << 4 | vg_b);
}
else{
p.push(chunk_tag::rgb);
push<3>(p, &px);
}
};
auto px = px_prev;
while(px_len--)[[likely]]{
px_prev = px;
pull<Channels>(&px, pixels);
f(px, px_prev);
if constexpr(Channels == 4)
if(px.v.a != px_prev.a){
p.push(chunk_tag::rgba);
push<4>(p, &px.v);
break;
}
const auto vg_2 = static_cast<int>(px.v.g) - static_cast<int>(px_prev.g);
if(const std::uint8_t g = vg_2+32; g < 64){
const auto vr = static_cast<int>(px.v.r) - static_cast<int>(px_prev.r) + 2;
const auto vg = vg_2 + 2;
const auto vb = static_cast<int>(px.v.b) - static_cast<int>(px_prev.b) + 2;

if(static_cast<std::uint8_t>(vr|vg|vb) < 4){
p.push(chunk_tag::diff | vr << 4 | vg << 2 | vb);
break;
}
const auto vg_r = vr - vg + 8;
const auto vg_b = vb - vg + 8;
if(static_cast<std::uint8_t>(vg_r|vg_b) < 16){
p.push(chunk_tag::luma | g);
p.push(vg_r << 4 | vg_b);
}
else
push<4>(p, &px);
}
else
push<4>(p, &px);
}while(false);
efficient_memcpy<Channels>(&px_prev, &px.v);
}
if(px == px_prev){
while(run >= 62)[[unlikely]]{
static constexpr std::uint8_t x = chunk_tag::run | 61;
p.push(x);
run -= 62;
}
if(run > 0){
p.push(chunk_tag::run | (run-1));
run = 0;
}
while(run >= 62)[[unlikely]]{
static constexpr std::uint8_t x = chunk_tag::run | 61;
p.push(x);
run -= 62;
}
if(run > 0)
p.push(chunk_tag::run | (run-1));
}
#ifndef QOIXX_NO_SIMD
#if defined(__ARM_FEATURE_SVE)
Expand Down Expand Up @@ -488,12 +516,12 @@ class qoi{
hash = svand_n_u8_x(mask, svadd_u8_x(mask, svadd_u8_x(mask, svmul_n_u8_x(mask, get<0>(pxs), 3), svmul_n_u8_x(mask, get<1>(pxs), 5)), svadd_u8_x(mask, svmul_n_u8_x(mask, get<2>(pxs), 7), svmul_n_u8_x(mask, get<3>(pxs), 11))), 63);
else
hash = svand_n_u8_x(mask, svadd_u8_x(mask, svadd_u8_x(mask, svmul_n_u8_x(mask, get<0>(pxs), 3), svmul_n_u8_x(mask, get<1>(pxs), 5)), svadd_n_u8_x(mask, svmul_n_u8_x(mask, get<2>(pxs), 7), static_cast<std::uint8_t>(255*11))), 63);
std::uint8_t runs[SVERegisterSize/8], diffs[SVERegisterSize/8], lus[SVERegisterSize/8], mas[SVERegisterSize/8], hashs[SVERegisterSize/8];
std::uint8_t runs[SVERegisterSize/8], diffs[SVERegisterSize/8], lumas[SVERegisterSize/8*2], hashs[SVERegisterSize/8];
[[maybe_unused]] std::uint8_t alphas[SVERegisterSize/8];
svst1_u8(mask, runs, svadd_n_u8_m(runv, zero, 1));
svst1_u8(mask, diffs, diffv);
svst1_u8(mask, lus, lu);
svst1_u8(mask, mas, ma);
const auto luma = svcreate2_u8(lu, ma);
svst2_u8(mask, lumas, luma);
svst1_u8(mask, hashs, hash);
if constexpr(Alpha)
if(!alpha)
Expand Down Expand Up @@ -534,9 +562,9 @@ class qoi{
}
if(diffs[i])
*p++ = diffs[i];
else if(lus[i]){
*p++ = lus[i];
*p++ = mas[i];
else if(lumas[i*2]){
std::memcpy(p, lumas + i*2, 2);
p += 2;
}
else{
*p++ = chunk_tag::rgb;
Expand Down Expand Up @@ -655,12 +683,11 @@ class qoi{
hash = vandq_u8(vaddq_u8(vaddq_u8(vmulq_u8(pxs.val[0], vdupq_n_u8(3)), vmulq_u8(pxs.val[1], vdupq_n_u8(5))), vaddq_u8(vmulq_u8(pxs.val[2], vdupq_n_u8(7)), vmulq_u8(pxs.val[3], vdupq_n_u8(11)))), vdupq_n_u8(63));
else
hash = vandq_u8(vaddq_u8(vaddq_u8(vmulq_u8(pxs.val[0], vdupq_n_u8(3)), vmulq_u8(pxs.val[1], vdupq_n_u8(5))), vaddq_u8(vmulq_u8(pxs.val[2], vdupq_n_u8(7)), vdupq_n_u8(static_cast<std::uint8_t>(255*11)))), vdupq_n_u8(63));
std::uint8_t runs[simd_lanes], diffs[simd_lanes], lus[simd_lanes], mas[simd_lanes], hashs[simd_lanes];
std::uint8_t runs[simd_lanes], diffs[simd_lanes], lumas[simd_lanes*2], hashs[simd_lanes];
[[maybe_unused]] std::uint8_t alphas[simd_lanes];
vst1q_u8(runs, runv);
vst1q_u8(diffs, diffv);
vst1q_u8(lus, lu);
vst1q_u8(mas, ma);
vst2q_u8(lumas, (uint8x16x2_t{lu, ma}));
vst1q_u8(hashs, hash);
if constexpr(Alpha)
if(!alpha)
Expand Down Expand Up @@ -701,9 +728,9 @@ class qoi{
}
if(diffs[i])
*p++ = diffs[i];
else if(lus[i]){
*p++ = lus[i];
*p++ = mas[i];
else if(lumas[i*2]){
std::memcpy(p, lumas + i*2, 2);
p += 2;
}
else{
*p++ = chunk_tag::rgb;
Expand All @@ -715,7 +742,13 @@ class qoi{
}
p_.advance(p-p_.raw_pointer());

encode_body<Channels>(p_, pixels_, index, px_len, px, prev_hash, run);
if constexpr(Alpha)
encode_body<Channels>(p_, pixels_, index, px_len, px, prev_hash, run);
else{
rgb_t px_prev;
efficient_memcpy<3>(&px_prev, &px);
encode_body<Channels>(p_, pixels_, index, px_len, px_prev, prev_hash, run);
}

push<sizeof(padding)>(p_, padding);
}
Expand Down Expand Up @@ -920,19 +953,20 @@ class qoi{
diff.val[0] = _mm256_add_epi8(_mm256_sub_epi8(diff.val[0], diff.val[1]), eight);
diff.val[2] = _mm256_add_epi8(_mm256_sub_epi8(diff.val[2], diff.val[1]), eight);
diff.val[1] = _mm256_add_epi8(diff.val[1], _mm256_set1_epi8(30));
const auto lu = _mm256_and_si256(_mm256_or_si256(_mm256_set1_epi8(static_cast<char>(chunk_tag::luma)), diff.val[1]), _mm256_cmpeq_epi8(_mm256_or_si256(_mm256_and_si256(_mm256_or_si256(diff.val[0], diff.val[2]), _mm256_set1_epi8(static_cast<char>(0xf0))), _mm256_and_si256(diff.val[1], _mm256_set1_epi8(static_cast<char>(0xc0)))), zero));
const auto ma = _mm256_or_si256(slli_epi8<4>(diff.val[0]), diff.val[2]);
const auto luma_mask = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
const auto lu = _mm256_permutevar8x32_epi32(_mm256_and_si256(_mm256_or_si256(_mm256_set1_epi8(static_cast<char>(chunk_tag::luma)), diff.val[1]), _mm256_cmpeq_epi8(_mm256_or_si256(_mm256_and_si256(_mm256_or_si256(diff.val[0], diff.val[2]), _mm256_set1_epi8(static_cast<char>(0xf0))), _mm256_and_si256(diff.val[1], _mm256_set1_epi8(static_cast<char>(0xc0)))), zero)), luma_mask);
const auto ma = _mm256_permutevar8x32_epi32(_mm256_or_si256(slli_epi8<4>(diff.val[0]), diff.val[2]), luma_mask);
__m256i hash;
if constexpr(Alpha)
hash = _mm256_and_si256(_mm256_add_epi8(_mm256_add_epi8(mul_epi8<3>(pxs.val[0]), mul_epi8<5>(pxs.val[1])), _mm256_add_epi8(mul_epi8<7>(pxs.val[2]), mul_epi8<11>(pxs.val[3]))), _mm256_set1_epi8(63));
else
hash = _mm256_and_si256(_mm256_add_epi8(_mm256_add_epi8(mul_epi8<3>(pxs.val[0]), mul_epi8<5>(pxs.val[1])), _mm256_add_epi8(mul_epi8<7>(pxs.val[2]), _mm256_set1_epi8(static_cast<std::uint8_t>(255*11)))), _mm256_set1_epi8(63));
alignas(alignof(__m256i)) std::uint8_t runs[simd_lanes], diffs[simd_lanes], lus[simd_lanes], mas[simd_lanes], hashs[simd_lanes];
alignas(alignof(__m256i)) std::uint8_t runs[simd_lanes], diffs[simd_lanes], lumas[simd_lanes*2], hashs[simd_lanes];
[[maybe_unused]] alignas(alignof(__m256i)) std::uint8_t alphas[simd_lanes];
_mm256_store_si256(reinterpret_cast<__m256i*>(runs), runv);
_mm256_store_si256(reinterpret_cast<__m256i*>(diffs), diffv);
_mm256_store_si256(reinterpret_cast<__m256i*>(lus), lu);
_mm256_store_si256(reinterpret_cast<__m256i*>(mas), ma);
_mm256_store_si256(reinterpret_cast<__m256i*>(lumas), _mm256_unpacklo_epi8(lu, ma));
_mm256_store_si256(reinterpret_cast<__m256i*>(lumas)+1, _mm256_unpackhi_epi8(lu, ma));
_mm256_store_si256(reinterpret_cast<__m256i*>(hashs), hash);
if constexpr(Alpha)
if(!alpha)
Expand Down Expand Up @@ -973,9 +1007,9 @@ class qoi{
}
if(diffs[i])
*p++ = diffs[i];
else if(lus[i]){
*p++ = lus[i];
*p++ = mas[i];
else if(lumas[i*2]){
std::memcpy(p, lumas + i*2, 2);
p += 2;
}
else{
*p++ = chunk_tag::rgb;
Expand All @@ -987,7 +1021,13 @@ class qoi{
}
p_.advance(p-p_.raw_pointer());

encode_body<Channels>(p_, pixels_, index, px_len, px, prev_hash, run);
if constexpr(Alpha)
encode_body<Channels>(p_, pixels_, index, px_len, px, prev_hash, run);
else{
rgb_t px_prev;
efficient_memcpy<3>(&px_prev, &px);
encode_body<Channels>(p_, pixels_, index, px_len, px_prev, prev_hash, run);
}

push<sizeof(padding)>(p_, padding);
}
Expand Down