Skip to content

Commit

Permalink
WIP avx2
Browse files Browse the repository at this point in the history
  • Loading branch information
niklaspandersson committed Jun 19, 2024
1 parent 00cb6c1 commit 630bf36
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 10 deletions.
2 changes: 2 additions & 0 deletions src/CMakeModules/Bootstrap_Linux.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,9 @@ endif()
IF (CMAKE_SYSTEM_PROCESSOR MATCHES "(i[3-6]86|x64|x86_64|amd64|e2k)")
ADD_COMPILE_OPTIONS (-msse3)
ADD_COMPILE_OPTIONS (-mssse3)
ADD_COMPILE_OPTIONS (-mavx)
ADD_COMPILE_OPTIONS (-msse4.1)
ADD_COMPILE_OPTIONS (-mavx2)
ELSE ()
ADD_COMPILE_DEFINITIONS (USE_SIMDE)
ENDIF ()
Expand Down
65 changes: 55 additions & 10 deletions src/modules/decklink/consumer/frame.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,16 @@ int get_row_bytes(BMDPixelFormat pix_fmt, int width)
return width * 4;
}

inline unsigned int pack_pixel(__m128i pixel) {
inline unsigned int pack_pixel(__m128i pixel)
{
// Scale down to 10 bit and convert to video range to get a valid
// v210 value after the decklink conversion
// formula: scaled_channel = (src >> 6) * 876 / 1024 + 64;

__m128i bit32 = _mm_unpacklo_epi16(pixel, _mm_setzero_si128()); // unpack 16 bit components to 32 bit
__m128i bit10 = _mm_srli_epi32(bit32, 6); // shift down to 10 bit precision
__m128i bit10 = _mm_srli_epi32(bit32, 6); // shift down to 10 bit precision
bit10 = _mm_mullo_epi32(bit10, _mm_set1_epi32(876)); // multiply by 876
bit10 = _mm_srli_epi32(bit10, 10); // divide by 1024
bit10 = _mm_srli_epi32(bit10, 10); // divide by 1024
bit10 = _mm_add_epi32(bit10, _mm_set1_epi32(64)); // add 64

// Extract the 10 bit components and save to dest
Expand Down Expand Up @@ -106,21 +107,65 @@ void convert_frame(const core::video_format_desc& channel_format_desc,
size_t byte_count_line = get_row_bytes(bmdFormat10BitRGBXLE, decklink_format_desc.width);
tbb::parallel_for(0, NUM_THREADS, [&](int i) {
auto end = (i + 1) * rows_per_thread;
//__m128i zero = _mm_setzero_si128();
__m256i zero = _mm256_setzero_si256();
//__m128i fac = _mm_set1_epi32(876);
__m256i fac = _mm256_set1_epi32(876);
//__m128i offset = _mm_set1_epi32(64);
__m256i offset = _mm256_set1_epi32(64);
for (int y = firstLine + i * rows_per_thread; y < end; y += decklink_format_desc.field_count) {
auto dest = reinterpret_cast<uint32_t*>(image_data.get()) + (long long)y * byte_count_line / 4;
__m128i zero = _mm_setzero_si128();
__m128i fac = _mm_set1_epi32(876);
__m128i offset = _mm_set1_epi32(64);

for (int x = 0; x < decklink_format_desc.width; x += 2) {
for (int x = 0; x < decklink_format_desc.width; x += 4) {
auto src = reinterpret_cast<const uint16_t*>(
frame.image_data(0).data() + (long long)y * decklink_format_desc.width * 8 + x * 8);

// SIMD optimized
// Load two pixels at once to stay on 16-byte aligned memory
__m128i pixels = _mm_load_si128(reinterpret_cast<const __m128i*>(src));
dest[x] = pack_pixel(_mm_unpacklo_epi64(pixels, zero));
dest[x + 1] = pack_pixel(_mm_unpackhi_epi64(pixels, zero));
//__m128i pixels = _mm_load_si128(reinterpret_cast<const __m128i*>(src));

// Load four pixels at once (16x4 = 64, 64 x 4 = 256 bytes)
__m256i pixels = _mm256_load_si256(reinterpret_cast<const __m256i*>(src));

//__m128i pixel1 = _mm_unpacklo_epi64(pixels, zero);
// pixel1 = _mm_unpacklo_epi16(pixel1, zero); // unpack 16 bit components to 32 bit
__m256i pixel13 = _mm256_unpacklo_epi16(pixels, zero);
__m256i pixel24 = _mm256_unpackhi_epi16(pixels, zero);

// pixel1 = _mm_srli_epi32(pixel1, 6); // shift down to 10 bit precision
pixel13 = _mm256_srli_epi32(pixel13, 6); // shift down to 10 bit precision
pixel24 = _mm256_srli_epi32(pixel24, 6); // shift down to 10 bit precision

// pixel1 = _mm_mullo_epi32(pixel1, _mm_set1_epi32(876)); // multiply by 876
pixel13 = _mm256_mullo_epi32(pixel13, fac); // multiply by 876
pixel24 = _mm256_mullo_epi32(pixel24, fac); // multiply by 876

// pixel1 = _mm_srli_epi32(pixel1, 10); // divide by 1024
pixel13 = _mm256_srli_epi32(pixel13, 10); // divide by 1024
pixel24 = _mm256_srli_epi32(pixel24, 10); // divide by 1024

// pixel1 = _mm_add_epi32(pixel1, _mm_set1_epi32(64)); // add 64
pixel13 = _mm256_add_epi32(pixel13, offset); // add 64
pixel24 = _mm256_add_epi32(pixel24, offset); // add 64

__m256i red_green = _mm256_unpacklo_epi32(pixel13, pixel24);
__m256i blue_alpha = _mm256_unpackhi_epi32(pixel13, pixel24);

__m128i low = _mm256_extracti128_si256(red_green, 0);
__m128i high = _mm256_extracti128_si256(red_green, 1);
__m128i red = _mm_unpacklo_epi64(low, high);
__m128i green = _mm_unpackhi_epi64(low, high);

low = _mm256_extracti128_si256(blue_alpha, 0);
high = _mm256_extracti128_si256(blue_alpha, 1);
__m128i blue = _mm_unpacklo_epi64(low, high);

red = _mm_slli_epi32(red, 22);
green = _mm_slli_epi32(green, 12);
blue = _mm_slli_epi32(blue, 2);

__m128i result = _mm_add_epi32(_mm_add_epi32(red, green), blue);
_mm_store_si128(reinterpret_cast<__m128i*>(dest[x]), result);
}
}
});
Expand Down

0 comments on commit 630bf36

Please sign in to comment.