Skip to content

Commit

Permalink
GDALInterleave2Byte(): provide SSE2 implementation for ICC and MSVC
Browse files Browse the repository at this point in the history
  • Loading branch information
rouault committed Jan 5, 2025
1 parent e751efd commit 9881325
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 4 deletions.
16 changes: 15 additions & 1 deletion autotest/cpp/test_gdal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5011,7 +5011,21 @@ TEST_F(test_gdal, GDALTranspose2D_Byte_optims)
}
}

// Optim H = 2
// Optim H = 2 with W < 16
{
constexpr int W = 15;
constexpr int H = 2;
GDALTranspose2D(in.data(), GDT_Byte, out.data(), GDT_Byte, W, H);
for (int y = 0; y < H; ++y)
{
for (int x = 0; x < W; ++x)
{
EXPECT_EQ(out[x * H + y], in[y * W + x]);
}
}
}

// Optim H = 2 with W >= 16
{
constexpr int W = 19;
constexpr int H = 2;
Expand Down
44 changes: 42 additions & 2 deletions gcore/rasterio.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5746,6 +5746,44 @@ static void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, DST *pDst,
/* GDALInterleave2Byte() */
/************************************************************************/

#if defined(HAVE_SSE2) && \
(!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))

// Explicit SSE2 variant: the ICC autovectorizer (at least icx
// 2024.0.2.20231213) does not generate good SSE code for the plain scalar
// loop, whereas it nicely unrolls the intrinsics-based loop below.
//
// Interleaves two consecutive runs of nIters bytes stored in pSrc, such
// that for each i in [0, nIters):
//   pDst[2 * i + 0] = pSrc[i]
//   pDst[2 * i + 1] = pSrc[i + nIters]
// pSrc must therefore hold 2 * nIters readable bytes and pDst 2 * nIters
// writable bytes.
#if defined(__GNUC__)
__attribute__((noinline))
#endif
static void
GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
                    uint8_t *CPL_RESTRICT pDst, size_t nIters)
{
    // Number of bytes held by one 128-bit SSE2 register.
    constexpr size_t BYTES_PER_REG = 16;

    const uint8_t *const pabyFirstRun = pSrc;
    const uint8_t *const pabySecondRun = pSrc + nIters;

    // Vector part: consume 16 bytes of each run and emit 32 interleaved
    // bytes per iteration.
    size_t iElt = 0;
    while (iElt + BYTES_PER_REG <= nIters)
    {
        const __m128i first = _mm_loadu_si128(
            reinterpret_cast<const __m128i *>(pabyFirstRun + iElt));
        const __m128i second = _mm_loadu_si128(
            reinterpret_cast<const __m128i *>(pabySecondRun + iElt));

        uint8_t *const pabyOut = pDst + 2 * iElt;
        // Low halves of both registers, byte-interleaved
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyOut),
                         _mm_unpacklo_epi8(first, second));
        // High halves of both registers, byte-interleaved
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyOut + BYTES_PER_REG),
                         _mm_unpackhi_epi8(first, second));

        iElt += BYTES_PER_REG;
    }

    // Scalar tail: fewer than 16 elements remain at this point, and clang
    // is asked not to vectorize this loop.
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
    for (; iElt < nIters; ++iElt)
    {
        pDst[2 * iElt] = pabyFirstRun[iElt];
        pDst[2 * iElt + 1] = pabySecondRun[iElt];
    }
}

#else

#if defined(__GNUC__) && !defined(__clang__)
__attribute__((optimize("tree-vectorize")))
#endif
Expand All @@ -5756,7 +5794,7 @@ static void
GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
uint8_t *CPL_RESTRICT pDst, size_t nIters)
{
#if defined(__clang__)
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
#pragma clang loop vectorize(enable)
#endif
for (size_t i = 0; i < nIters; ++i)
Expand All @@ -5766,6 +5804,8 @@ GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
}
}

#endif

/************************************************************************/
/* GDALInterleave4Byte() */
/************************************************************************/
Expand All @@ -5780,7 +5820,7 @@ static void
GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
uint8_t *CPL_RESTRICT pDst, size_t nIters)
{
#if defined(__clang__)
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
#pragma clang loop vectorize(enable)
#endif
for (size_t i = 0; i < nIters; ++i)
Expand Down
2 changes: 1 addition & 1 deletion gcore/rasterio_ssse3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ static void
GDALInterleave3Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
uint8_t *CPL_RESTRICT pDst, size_t nIters)
{
#if defined(__clang__)
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
#pragma clang loop vectorize(enable)
#endif
for (size_t i = 0; i < nIters; ++i)
Expand Down

0 comments on commit 9881325

Please sign in to comment.