diff --git a/autotest/cpp/test_gdal.cpp b/autotest/cpp/test_gdal.cpp index 0a5a160c521a..4103f7ee891e 100644 --- a/autotest/cpp/test_gdal.cpp +++ b/autotest/cpp/test_gdal.cpp @@ -5011,7 +5011,21 @@ TEST_F(test_gdal, GDALTranspose2D_Byte_optims) } } - // Optim H = 2 + // Optim H = 2 with W < 16 + { + constexpr int W = 15; + constexpr int H = 2; + GDALTranspose2D(in.data(), GDT_Byte, out.data(), GDT_Byte, W, H); + for (int y = 0; y < H; ++y) + { + for (int x = 0; x < W; ++x) + { + EXPECT_EQ(out[x * H + y], in[y * W + x]); + } + } + } + + // Optim H = 2 with W >= 16 { constexpr int W = 19; constexpr int H = 2; diff --git a/gcore/rasterio.cpp b/gcore/rasterio.cpp index 8e048fee8e2b..d904d52e4b30 100644 --- a/gcore/rasterio.cpp +++ b/gcore/rasterio.cpp @@ -5746,6 +5746,44 @@ static void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, DST *pDst, /* GDALInterleave2Byte() */ /************************************************************************/ +#if defined(HAVE_SSE2) && \ + (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER)) + +// The ICC autovectorizer doesn't do a good job at generating SSE code, +// at least with icx 2024.0.2.20231213, but it nicely unrolls the loop below. 
+#if defined(__GNUC__) +__attribute__((noinline)) +#endif +static void +GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc, + uint8_t *CPL_RESTRICT pDst, size_t nIters) +{ + size_t i = 0; + constexpr size_t VALS_PER_ITER = 16; + for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER) + { + __m128i xmm0 = + _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + i)); + __m128i xmm1 = _mm_loadu_si128( + reinterpret_cast<__m128i const *>(pSrc + i + nIters)); + _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + 2 * i), + _mm_unpacklo_epi8(xmm0, xmm1)); + _mm_storeu_si128( + reinterpret_cast<__m128i *>(pDst + 2 * i + VALS_PER_ITER), + _mm_unpackhi_epi8(xmm0, xmm1)); + } +#if defined(__clang__) +#pragma clang loop vectorize(disable) +#endif + for (; i < nIters; ++i) + { + pDst[2 * i + 0] = pSrc[i + 0 * nIters]; + pDst[2 * i + 1] = pSrc[i + 1 * nIters]; + } +} + +#else + #if defined(__GNUC__) && !defined(__clang__) __attribute__((optimize("tree-vectorize"))) #endif @@ -5756,7 +5794,7 @@ static void GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc, uint8_t *CPL_RESTRICT pDst, size_t nIters) { -#if defined(__clang__) +#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER) #pragma clang loop vectorize(enable) #endif for (size_t i = 0; i < nIters; ++i) @@ -5766,6 +5804,8 @@ GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc, } } +#endif + /************************************************************************/ /* GDALInterleave4Byte() */ /************************************************************************/ @@ -5780,7 +5820,7 @@ static void GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc, uint8_t *CPL_RESTRICT pDst, size_t nIters) { -#if defined(__clang__) +#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER) #pragma clang loop vectorize(enable) #endif for (size_t i = 0; i < nIters; ++i) diff --git a/gcore/rasterio_ssse3.cpp b/gcore/rasterio_ssse3.cpp index e3f9636a7c27..b09fb4726fc8 100644 --- a/gcore/rasterio_ssse3.cpp +++ 
b/gcore/rasterio_ssse3.cpp @@ -286,7 +286,7 @@ static void GDALInterleave3Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc, uint8_t *CPL_RESTRICT pDst, size_t nIters) { -#if defined(__clang__) +#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER) #pragma clang loop vectorize(enable) #endif for (size_t i = 0; i < nIters; ++i)