diff --git a/autotest/cpp/test_gdal.cpp b/autotest/cpp/test_gdal.cpp
index 5b3289cf66ce..0a5a160c521a 100644
--- a/autotest/cpp/test_gdal.cpp
+++ b/autotest/cpp/test_gdal.cpp
@@ -4989,4 +4989,97 @@ TEST_F(test_gdal, GDALTranspose2D)
     }
 }
 
+TEST_F(test_gdal, GDALTranspose2D_Byte_optims)
+{
+    std::vector<GByte> in;
+    for (int i = 0; i < 19 * 17; ++i)
+        in.push_back(static_cast<GByte>(i % 256));
+
+    std::vector<GByte> out(in.size());
+
+    // SSSE3 optim (16x16 blocks)
+    {
+        constexpr int W = 19;
+        constexpr int H = 17;
+        GDALTranspose2D(in.data(), GDT_Byte, out.data(), GDT_Byte, W, H);
+        for (int y = 0; y < H; ++y)
+        {
+            for (int x = 0; x < W; ++x)
+            {
+                EXPECT_EQ(out[x * H + y], in[y * W + x]);
+            }
+        }
+    }
+
+    // Optim H = 2
+    {
+        constexpr int W = 19;
+        constexpr int H = 2;
+        GDALTranspose2D(in.data(), GDT_Byte, out.data(), GDT_Byte, W, H);
+        for (int y = 0; y < H; ++y)
+        {
+            for (int x = 0; x < W; ++x)
+            {
+                EXPECT_EQ(out[x * H + y], in[y * W + x]);
+            }
+        }
+    }
+
+    // SSSE3 optim H = 3
+    {
+        constexpr int W = 19;
+        constexpr int H = 3;
+        GDALTranspose2D(in.data(), GDT_Byte, out.data(), GDT_Byte, W, H);
+        for (int y = 0; y < H; ++y)
+        {
+            for (int x = 0; x < W; ++x)
+            {
+                EXPECT_EQ(out[x * H + y], in[y * W + x]);
+            }
+        }
+    }
+
+    // Optim H = 4
+    {
+        constexpr int W = 19;
+        constexpr int H = 4;
+        GDALTranspose2D(in.data(), GDT_Byte, out.data(), GDT_Byte, W, H);
+        for (int y = 0; y < H; ++y)
+        {
+            for (int x = 0; x < W; ++x)
+            {
+                EXPECT_EQ(out[x * H + y], in[y * W + x]);
+            }
+        }
+    }
+
+    // SSSE3 optim H = 5 with W < 16
+    {
+        constexpr int W = 15;
+        constexpr int H = 5;
+        GDALTranspose2D(in.data(), GDT_Byte, out.data(), GDT_Byte, W, H);
+        for (int y = 0; y < H; ++y)
+        {
+            for (int x = 0; x < W; ++x)
+            {
+                EXPECT_EQ(out[x * H + y], in[y * W + x]);
+            }
+        }
+    }
+
+    // SSSE3 optim H = 5 with W >= 16
+    {
+        constexpr int W = 19;
+        constexpr int H = 5;
+        GDALTranspose2D(in.data(), GDT_Byte, out.data(), GDT_Byte, W, H);
+        for (int y = 0; y < H; ++y)
+        {
+            for (int x = 0; x < W; ++x)
+            {
+                EXPECT_EQ(out[x * H + y], in[y * W + x]);
+            }
+        }
+    }
+}
+
 }  // namespace
diff --git a/gcore/rasterio.cpp b/gcore/rasterio.cpp
index 716e6bfac4a1..d4949efbf47e 100644
--- a/gcore/rasterio.cpp
+++ b/gcore/rasterio.cpp
@@ -5738,6 +5738,56 @@ static void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, DST *pDst,
 #undef GDALTranspose2DComplex_internal
 }
 
+/************************************************************************/
+/*                        GDALInterleave2Byte()                         */
+/************************************************************************/
+
+#if defined(__GNUC__) && !defined(__clang__)
+__attribute__((optimize("tree-vectorize")))
+#endif
+#if defined(__GNUC__)
+__attribute__((noinline))
+#endif
+static void
+GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
+                    uint8_t *CPL_RESTRICT pDst, size_t nIters)
+{
+#if defined(__clang__)
+#pragma clang loop vectorize(enable)
+#endif
+    for (size_t i = 0; i < nIters; ++i)
+    {
+        pDst[2 * i + 0] = pSrc[i + 0 * nIters];
+        pDst[2 * i + 1] = pSrc[i + 1 * nIters];
+    }
+}
+
+/************************************************************************/
+/*                        GDALInterleave4Byte()                         */
+/************************************************************************/
+
+#if defined(__GNUC__) && !defined(__clang__)
+__attribute__((optimize("tree-vectorize")))
+#endif
+#if defined(__GNUC__)
+__attribute__((noinline))
+#endif
+static void
+GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
+                    uint8_t *CPL_RESTRICT pDst, size_t nIters)
+{
+#if defined(__clang__)
+#pragma clang loop vectorize(enable)
+#endif
+    for (size_t i = 0; i < nIters; ++i)
+    {
+        pDst[4 * i + 0] = pSrc[i + 0 * nIters];
+        pDst[4 * i + 1] = pSrc[i + 1 * nIters];
+        pDst[4 * i + 2] = pSrc[i + 2 * nIters];
+        pDst[4 * i + 3] = pSrc[i + 3 * nIters];
+    }
+}
+
 /************************************************************************/
 /*                          GDALTranspose2D()                           */
 /************************************************************************/
@@ -5757,6 +5807,39 @@ static void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, DST *pDst,
 void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, void *pDst,
                      GDALDataType eDstType, size_t nSrcWidth, size_t nSrcHeight)
 {
+    if (eSrcType == eDstType && (eSrcType == GDT_Byte || eSrcType == GDT_Int8))
+    {
+        if (nSrcHeight == 2)
+        {
+            GDALInterleave2Byte(static_cast<const uint8_t *>(pSrc),
+                                static_cast<uint8_t *>(pDst), nSrcWidth);
+            return;
+        }
+        if (nSrcHeight == 4)
+        {
+            GDALInterleave4Byte(static_cast<const uint8_t *>(pSrc),
+                                static_cast<uint8_t *>(pDst), nSrcWidth);
+            return;
+        }
+#if (defined(HAVE_SSSE3_AT_COMPILE_TIME) &&                                   \
+     (defined(__x86_64) || defined(_M_X64)))
+        if (CPLHaveRuntimeSSSE3())
+        {
+            GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
+                                       static_cast<uint8_t *>(pDst), nSrcWidth,
+                                       nSrcHeight);
+            return;
+        }
+#elif defined(USE_NEON_OPTIMIZATIONS)
+        {
+            GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
+                                       static_cast<uint8_t *>(pDst), nSrcWidth,
+                                       nSrcHeight);
+            return;
+        }
+#endif
+    }
+
 #define GDALTranspose2D_internal(DST_TYPE_CST, DST_TYPE, DST_IS_COMPLEX)      \
     case DST_TYPE_CST:                                                        \
         GDALTranspose2D<SRC, DST_TYPE, DST_IS_COMPLEX>(                       \
diff --git a/gcore/rasterio_ssse3.cpp b/gcore/rasterio_ssse3.cpp
index fa9cd6ab24e4..e3f9636a7c27 100644
--- a/gcore/rasterio_ssse3.cpp
+++ b/gcore/rasterio_ssse3.cpp
@@ -12,6 +12,8 @@
 
 #include "cpl_port.h"
 
+#include <algorithm>
+
 #if (defined(HAVE_SSSE3_AT_COMPILE_TIME) &&                                   \
      (defined(__x86_64) || defined(_M_X64))) ||                               \
     defined(USE_NEON_OPTIMIZATIONS)
@@ -140,6 +142,35 @@ void GDALDeinterleave3Byte_SSSE3(const GByte *CPL_RESTRICT pabySrc,
 }
 #endif
 
+/************************************************************************/
+/*                       GDALTranspose4x4Int32()                        */
+/************************************************************************/
+
+// Considering that the 4 input registers hold a 4x4 matrix of 4-byte words,
+// return the transposition of this 4x4 matrix.
+// Considering that in0 = (in00, in01, in02, in03)
+// Considering that in1 = (in10, in11, in12, in13)
+// Considering that in2 = (in20, in21, in22, in23)
+// Considering that in3 = (in30, in31, in32, in33)
+// Return out0 = (in00, in10, in20, in30)
+// Return out1 = (in01, in11, in21, in31)
+// Return out2 = (in02, in12, in22, in32)
+// Return out3 = (in03, in13, in23, in33)
+inline void GDALTranspose4x4Int32(__m128i in0, __m128i in1, __m128i in2,
+                                  __m128i in3, __m128i &out0, __m128i &out1,
+                                  __m128i &out2, __m128i &out3)
+{
+    __m128i tmp0 = _mm_unpacklo_epi32(in0, in1);  // (in00, in10, in01, in11)
+    __m128i tmp1 = _mm_unpackhi_epi32(in0, in1);  // (in02, in12, in03, in13)
+    __m128i tmp2 = _mm_unpacklo_epi32(in2, in3);  // (in20, in30, in21, in31)
+    __m128i tmp3 = _mm_unpackhi_epi32(in2, in3);  // (in22, in32, in23, in33)
+
+    out0 = _mm_unpacklo_epi64(tmp0, tmp2);  // (in00, in10, in20, in30)
+    out1 = _mm_unpackhi_epi64(tmp0, tmp2);  // (in01, in11, in21, in31)
+    out2 = _mm_unpacklo_epi64(tmp1, tmp3);  // (in02, in12, in22, in32)
+    out3 = _mm_unpackhi_epi64(tmp1, tmp3);  // (in03, in13, in23, in33)
+}
+
 /************************************************************************/
 /*                     GDALDeinterleave4Byte_SSSE3()                    */
 /************************************************************************/
@@ -169,14 +200,7 @@ void GDALDeinterleave4Byte_SSSE3(const GByte *CPL_RESTRICT pabySrc,
     xmm2 = _mm_shuffle_epi8(xmm2, shuffle_mask);  // W8 W9 WA WB
     xmm3 = _mm_shuffle_epi8(xmm3, shuffle_mask);  // WC WD WE WF
 
-    __m128i xmm01lo = _mm_unpacklo_epi32(xmm0, xmm1);  // W0 W4 W1 W5
-    __m128i xmm01hi = _mm_unpackhi_epi32(xmm0, xmm1);  // W2 W6 W3 W7
-    __m128i xmm23lo = _mm_unpacklo_epi32(xmm2, xmm3);  // W8 WC W9 WD
-    __m128i xmm23hi = _mm_unpackhi_epi32(xmm2, xmm3);  // WA WE WB WF
-    xmm0 = _mm_unpacklo_epi64(xmm01lo, xmm23lo);  // W0 W4 W8 WC
-    xmm1 = _mm_unpackhi_epi64(xmm01lo, xmm23lo);  // W1 W5 W9 WD
-    xmm2 = _mm_unpacklo_epi64(xmm01hi, xmm23hi);  // W2 W6 WA WE
-    xmm3 = _mm_unpackhi_epi64(xmm01hi, xmm23hi);  // W3 W7 WB WF
+    GDALTranspose4x4Int32(xmm0, xmm1, xmm2, xmm3, xmm0, xmm1, xmm2, xmm3);
 
     _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest0 + i), xmm0);
     _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest1 + i), xmm1);
@@ -248,4 +272,362 @@ void GDALDeinterleave4UInt16_SSSE3(const GUInt16* CPL_RESTRICT panSrc,
 }
 #endif
 
+/************************************************************************/
+/*                     GDALInterleave3Byte_SSSE3()                      */
+/************************************************************************/
+
+#if defined(__GNUC__) && !defined(__clang__)
+__attribute__((optimize("tree-vectorize")))
+#endif
+#if defined(__GNUC__)
+__attribute__((noinline))
+#endif
+static void
+GDALInterleave3Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
+                          uint8_t *CPL_RESTRICT pDst, size_t nIters)
+{
+#if defined(__clang__)
+#pragma clang loop vectorize(enable)
+#endif
+    for (size_t i = 0; i < nIters; ++i)
+    {
+        pDst[3 * i + 0] = pSrc[i + 0 * nIters];
+        pDst[3 * i + 1] = pSrc[i + 1 * nIters];
+        pDst[3 * i + 2] = pSrc[i + 2 * nIters];
+    }
+}
+
+/************************************************************************/
+/*                     GDALInterleave5Byte_SSSE3()                      */
+/************************************************************************/
+
+inline __m128i loadu(const uint8_t *pSrc, size_t i, size_t srcStride)
+{
+    return _mm_loadu_si128(
+        reinterpret_cast<const __m128i *>(pSrc + i * srcStride));
+}
+
+inline void storeu(uint8_t *pDst, size_t i, size_t dstStride, __m128i reg)
+{
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + i * dstStride), reg);
+}
+
+inline __m128i GDAL_mm_or_5_si128(__m128i r0, __m128i r1, __m128i r2,
+                                  __m128i r3, __m128i r4)
+{
+    return _mm_or_si128(
+        _mm_or_si128(_mm_or_si128(r0, r1), _mm_or_si128(r2, r3)), r4);
+}
+
+void GDALInterleave5Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
+                               uint8_t *CPL_RESTRICT pDst, size_t nIters)
+{
+    size_t i = 0;
+    constexpr size_t VALS_PER_ITER = 16;
+
+    if (nIters >= VALS_PER_ITER)
+    {
+        // clang-format off
+        constexpr char X = -1;
+        // How to dispatch 16 values of row=0 onto 5x16 bytes
+        const __m128i xmm_shuffle00 = _mm_setr_epi8(0, X, X, X, X,
+                                                    1, X, X, X, X,
+                                                    2, X, X, X, X,
+                                                    3);
+        const __m128i xmm_shuffle01 = _mm_setr_epi8(   X, X, X, X,
+                                                    4, X, X, X, X,
+                                                    5, X, X, X, X,
+                                                    6, X);
+        const __m128i xmm_shuffle02 = _mm_setr_epi8(      X, X, X,
+                                                    7, X, X, X, X,
+                                                    8, X, X, X, X,
+                                                    9, X, X);
+        const __m128i xmm_shuffle03 = _mm_setr_epi8(         X, X,
+                                                    10, X, X, X, X,
+                                                    11, X, X, X, X,
+                                                    12, X, X, X);
+        const __m128i xmm_shuffle04 = _mm_setr_epi8(            X,
+                                                    13, X, X, X, X,
+                                                    14, X, X, X, X,
+                                                    15, X, X, X, X);
+
+        // How to dispatch 16 values of row=1 onto 5x16 bytes
+        const __m128i xmm_shuffle10 = _mm_setr_epi8(X, 0, X, X, X,
+                                                    X, 1, X, X, X,
+                                                    X, 2, X, X, X,
+                                                    X);
+        const __m128i xmm_shuffle11 = _mm_setr_epi8(   3, X, X, X,
+                                                    X, 4, X, X, X,
+                                                    X, 5, X, X, X,
+                                                    X, 6);
+        const __m128i xmm_shuffle12 = _mm_setr_epi8(      X, X, X,
+                                                    X, 7, X, X, X,
+                                                    X, 8, X, X, X,
+                                                    X, 9, X);
+        const __m128i xmm_shuffle13 = _mm_setr_epi8(         X, X,
+                                                    X, 10, X, X, X,
+                                                    X, 11, X, X, X,
+                                                    X, 12, X, X);
+        const __m128i xmm_shuffle14 = _mm_setr_epi8(            X,
+                                                    X, 13, X, X, X,
+                                                    X, 14, X, X, X,
+                                                    X, 15, X, X, X);
+
+        // How to dispatch 16 values of row=2 onto 5x16 bytes
+        const __m128i xmm_shuffle20 = _mm_setr_epi8(X, X, 0, X, X,
+                                                    X, X, 1, X, X,
+                                                    X, X, 2, X, X,
+                                                    X);
+        const __m128i xmm_shuffle21 = _mm_setr_epi8(   X, 3, X, X,
+                                                    X, X, 4, X, X,
+                                                    X, X, 5, X, X,
+                                                    X, X);
+        const __m128i xmm_shuffle22 = _mm_setr_epi8(      6, X, X,
+                                                    X, X, 7, X, X,
+                                                    X, X, 8, X, X,
+                                                    X, X, 9);
+        const __m128i xmm_shuffle23 = _mm_setr_epi8(         X, X,
+                                                    X, X, 10, X, X,
+                                                    X, X, 11, X, X,
+                                                    X, X, 12, X);
+        const __m128i xmm_shuffle24 = _mm_setr_epi8(            X,
+                                                    X, X, 13, X, X,
+                                                    X, X, 14, X, X,
+                                                    X, X, 15, X, X);
+
+        // How to dispatch 16 values of row=3 onto 5x16 bytes
+        const __m128i xmm_shuffle30 = _mm_setr_epi8(X, X, X, 0, X,
+                                                    X, X, X, 1, X,
+                                                    X, X, X, 2, X,
+                                                    X);
+        const __m128i xmm_shuffle31 = _mm_setr_epi8(   X, X, 3, X,
+                                                    X, X, X, 4, X,
+                                                    X, X, X, 5, X,
+                                                    X, X);
+        const __m128i xmm_shuffle32 = _mm_setr_epi8(      X, 6, X,
+                                                    X, X, X, 7, X,
+                                                    X, X, X, 8, X,
+                                                    X, X, X);
+        const __m128i xmm_shuffle33 = _mm_setr_epi8(         9, X,
+                                                    X, X, X, 10, X,
+                                                    X, X, X, 11, X,
+                                                    X, X, X, 12);
+        const __m128i xmm_shuffle34 = _mm_setr_epi8(            X,
+                                                    X, X, X, 13, X,
+                                                    X, X, X, 14, X,
+                                                    X, X, X, 15, X);
+
+        // How to dispatch 16 values of row=4 onto 5x16 bytes
+        const __m128i xmm_shuffle40 = _mm_setr_epi8(X, X, X, X, 0,
+                                                    X, X, X, X, 1,
+                                                    X, X, X, X, 2,
+                                                    X);
+        const __m128i xmm_shuffle41 = _mm_setr_epi8(   X, X, X, 3,
+                                                    X, X, X, X, 4,
+                                                    X, X, X, X, 5,
+                                                    X, X);
+        const __m128i xmm_shuffle42 = _mm_setr_epi8(      X, X, 6,
+                                                    X, X, X, X, 7,
+                                                    X, X, X, X, 8,
+                                                    X, X, X);
+        const __m128i xmm_shuffle43 = _mm_setr_epi8(         X, 9,
+                                                    X, X, X, X, 10,
+                                                    X, X, X, X, 11,
+                                                    X, X, X, X);
+        const __m128i xmm_shuffle44 = _mm_setr_epi8(            12,
+                                                    X, X, X, X, 13,
+                                                    X, X, X, X, 14,
+                                                    X, X, X, X, 15);
+        // clang-format on
+
+        for (; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
+        {
+#define LOAD(x) __m128i xmm##x = loadu(pSrc + i, x, nIters)
+            LOAD(0);
+            LOAD(1);
+            LOAD(2);
+            LOAD(3);
+            LOAD(4);
+
+#define SHUFFLE(x, y) _mm_shuffle_epi8(xmm##y, xmm_shuffle##y##x)
+#define COMBINE_5(x)                                                          \
+    GDAL_mm_or_5_si128(SHUFFLE(x, 0), SHUFFLE(x, 1), SHUFFLE(x, 2),           \
+                       SHUFFLE(x, 3), SHUFFLE(x, 4))
+
+#define STORE(x)                                                              \
+    storeu(pDst, 5 * (i / VALS_PER_ITER) + x, VALS_PER_ITER, COMBINE_5(x))
+            STORE(0);
+            STORE(1);
+            STORE(2);
+            STORE(3);
+            STORE(4);
+#undef LOAD
+#undef COMBINE_5
+#undef SHUFFLE
+#undef STORE
+        }
+    }
+
+    for (; i < nIters; ++i)
+    {
+#define INTERLEAVE(x) pDst[5 * i + x] = pSrc[i + x * nIters]
+        INTERLEAVE(0);
+        INTERLEAVE(1);
+        INTERLEAVE(2);
+        INTERLEAVE(3);
+        INTERLEAVE(4);
+#undef INTERLEAVE
+    }
+}
+
+/************************************************************************/
+/*                     GDALTranspose2D_Byte_SSSE3()                     */
+/************************************************************************/
+
+// Given r = (b00, b01, b02, b03,
+//            b10, b11, b12, b13,
+//            b20, b21, b22, b23,
+//            b30, b31, b32, b33)
+// Return    (b00, b10, b20, b30,
+//            b01, b11, b21, b31,
+//            b02, b12, b22, b32,
+//            b03, b13, b23, b33)
+inline void GDALReorderForTranspose4x4(__m128i &r)
+{
+    const __m128i shuffle_mask =
+        _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
+
+    r = _mm_shuffle_epi8(r, shuffle_mask);
+}
+
+// Transpose the 16x16 byte values contained in the 16 SSE registers
+inline void GDALTranspose16x16ByteBlock_SSSE3(
+    __m128i &r00, __m128i &r01, __m128i &r02, __m128i &r03, __m128i &r04,
+    __m128i &r05, __m128i &r06, __m128i &r07, __m128i &r08, __m128i &r09,
+    __m128i &r10, __m128i &r11, __m128i &r12, __m128i &r13, __m128i &r14,
+    __m128i &r15)
+{
+    __m128i tmp00, tmp01, tmp02, tmp03;
+    __m128i tmp10, tmp11, tmp12, tmp13;
+    __m128i tmp20, tmp21, tmp22, tmp23;
+    __m128i tmp30, tmp31, tmp32, tmp33;
+
+    GDALTranspose4x4Int32(r00, r01, r02, r03, tmp00, tmp01, tmp02, tmp03);
+    GDALTranspose4x4Int32(r04, r05, r06, r07, tmp10, tmp11, tmp12, tmp13);
+    GDALTranspose4x4Int32(r08, r09, r10, r11, tmp20, tmp21, tmp22, tmp23);
+    GDALTranspose4x4Int32(r12, r13, r14, r15, tmp30, tmp31, tmp32, tmp33);
+
+    GDALReorderForTranspose4x4(tmp00);
+    GDALReorderForTranspose4x4(tmp01);
+    GDALReorderForTranspose4x4(tmp02);
+    GDALReorderForTranspose4x4(tmp03);
+    GDALReorderForTranspose4x4(tmp10);
+    GDALReorderForTranspose4x4(tmp11);
+    GDALReorderForTranspose4x4(tmp12);
+    GDALReorderForTranspose4x4(tmp13);
+    GDALReorderForTranspose4x4(tmp20);
+    GDALReorderForTranspose4x4(tmp21);
+    GDALReorderForTranspose4x4(tmp22);
+    GDALReorderForTranspose4x4(tmp23);
+    GDALReorderForTranspose4x4(tmp30);
+    GDALReorderForTranspose4x4(tmp31);
+    GDALReorderForTranspose4x4(tmp32);
+    GDALReorderForTranspose4x4(tmp33);
+
+    GDALTranspose4x4Int32(tmp00, tmp10, tmp20, tmp30, r00, r01, r02, r03);
+    GDALTranspose4x4Int32(tmp01, tmp11, tmp21, tmp31, r04, r05, r06, r07);
+    GDALTranspose4x4Int32(tmp02, tmp12, tmp22, tmp32, r08, r09, r10, r11);
+    GDALTranspose4x4Int32(tmp03, tmp13, tmp23, tmp33, r12, r13, r14, r15);
+}
+
+inline void GDALTranspose2D16x16Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
+                                           uint8_t *CPL_RESTRICT pDst,
+                                           size_t srcStride, size_t dstStride)
+{
+#define LOAD(x) __m128i r##x = loadu(pSrc, x, srcStride)
+    LOAD(0);
+    LOAD(1);
+    LOAD(2);
+    LOAD(3);
+    LOAD(4);
+    LOAD(5);
+    LOAD(6);
+    LOAD(7);
+    LOAD(8);
+    LOAD(9);
+    LOAD(10);
+    LOAD(11);
+    LOAD(12);
+    LOAD(13);
+    LOAD(14);
+    LOAD(15);
+#undef LOAD
+
+    GDALTranspose16x16ByteBlock_SSSE3(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9,
+                                      r10, r11, r12, r13, r14, r15);
+
+#define STORE(x) storeu(pDst, x, dstStride, r##x)
+    STORE(0);
+    STORE(1);
+    STORE(2);
+    STORE(3);
+    STORE(4);
+    STORE(5);
+    STORE(6);
+    STORE(7);
+    STORE(8);
+    STORE(9);
+    STORE(10);
+    STORE(11);
+    STORE(12);
+    STORE(13);
+    STORE(14);
+    STORE(15);
+#undef STORE
+}
+
+void GDALTranspose2D_Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
+                                uint8_t *CPL_RESTRICT pDst, size_t nSrcWidth,
+                                size_t nSrcHeight)
+{
+    if (nSrcHeight == 3)
+    {
+        GDALInterleave3Byte_SSSE3(pSrc, pDst, nSrcWidth);
+    }
+    else if (nSrcHeight == 5)
+    {
+        GDALInterleave5Byte_SSSE3(pSrc, pDst, nSrcWidth);
+    }
+    else
+    {
+        constexpr size_t blocksize = 16;
+        for (size_t i = 0; i < nSrcHeight; i += blocksize)
+        {
+            const size_t max_k = std::min(i + blocksize, nSrcHeight);
+            for (size_t j = 0; j < nSrcWidth; j += blocksize)
+            {
+                // transpose the block beginning at [i,j]
+                const size_t max_l = std::min(j + blocksize, nSrcWidth);
+                if (max_k - i == blocksize && max_l - j == blocksize)
+                {
+                    GDALTranspose2D16x16Byte_SSSE3(&pSrc[j + i * nSrcWidth],
+                                                   &pDst[i + j * nSrcHeight],
+                                                   nSrcWidth, nSrcHeight);
+                }
+                else
+                {
+                    for (size_t k = i; k < max_k; ++k)
+                    {
+                        for (size_t l = j; l < max_l; ++l)
+                        {
+                            GDALCopyWord(pSrc[l + k * nSrcWidth],
+                                         pDst[k + l * nSrcHeight]);
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
 #endif  // HAVE_SSSE3_AT_COMPILE_TIME
diff --git a/gcore/rasterio_ssse3.h b/gcore/rasterio_ssse3.h
index ac20e45c6c57..f720201842ac 100644
--- a/gcore/rasterio_ssse3.h
+++ b/gcore/rasterio_ssse3.h
@@ -54,6 +54,10 @@ void GDALDeinterleave4UInt16_SSSE3(const GUInt16 *CPL_RESTRICT panSrc,
                                    size_t nIters);
 #endif
 
+void GDALTranspose2D_Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
+                                uint8_t *CPL_RESTRICT pDst, size_t nSrcWidth,
+                                size_t nSrcHeight);
+
 #endif
 
 #endif /* RASTERIO_SSSE3_H_INCLUDED */
diff --git a/perftests/CMakeLists.txt b/perftests/CMakeLists.txt
index a103013d3c9b..070894d812d0 100644
--- a/perftests/CMakeLists.txt
+++ b/perftests/CMakeLists.txt
@@ -30,3 +30,12 @@ if (GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS)
 endif()
 add_test(NAME testperf_gdal_minmax_element COMMAND testperf_gdal_minmax_element)
 set_property(TEST testperf_gdal_minmax_element PROPERTY ENVIRONMENT "${TEST_ENV}")
+
+gdal_test_target(testperftranspose testperftranspose.cpp)
+if (HAVE_SSSE3_AT_COMPILE_TIME)
+  target_compile_definitions(testperftranspose PRIVATE -DHAVE_SSSE3_AT_COMPILE_TIME)
+endif()
+if (CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
+  add_test(NAME testperftranspose COMMAND testperftranspose)
+  set_property(TEST testperftranspose PROPERTY ENVIRONMENT "${TEST_ENV}")
+endif()
diff --git a/perftests/testperftranspose.cpp b/perftests/testperftranspose.cpp
new file mode 100644
index 000000000000..504aa5da844f
--- /dev/null
+++ b/perftests/testperftranspose.cpp
@@ -0,0 +1,91 @@
+/******************************************************************************
+ * $Id$
+ *
+ * Project:  GDAL Core
+ * Purpose:  Test performance of GDALTranspose2D().
+ * Author:   Even Rouault, <even dot rouault at spatialys.com>
+ *
+ ******************************************************************************
+ * Copyright (c) 2025, Even Rouault
+ *
+ * SPDX-License-Identifier: MIT
+ ****************************************************************************/
+
+#include "gdal.h"
+#include "cpl_conv.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#define SIZE (1024 * 1024 + 1) * 100
+
+static void test(const void *pSrc, GDALDataType eSrcType, void *pDst,
+                 GDALDataType eDstType, int W, int H, int reducFactor,
+                 const char *extraMsg = "")
+{
+    CPLAssert(W * H <= SIZE);
+
+    const int niters =
+        static_cast<int>(4000U * 1000 * 1000 / reducFactor / W / H);
+    const auto start = clock();
+    for (int i = 0; i < niters; ++i)
+        GDALTranspose2D(pSrc, eSrcType, pDst, eDstType, W, H);
+    const auto end = clock();
+    printf("W=%d, H=%d, reducFactor=%d%s: %0.2f sec\n", W, H, reducFactor,
+           extraMsg, (end - start) * reducFactor * 1.0 / CLOCKS_PER_SEC);
+}
+
+int main(int /* argc */, char * /* argv */[])
+{
+    void *src = CPLCalloc(1, SIZE);
+    void *dst = CPLCalloc(1, SIZE);
+
+    test(src, GDT_Byte, dst, GDT_Byte, 1024 * 1024 + 1, 2, 1);
+    test(src, GDT_Byte, dst, GDT_Byte, 1024 * 1024 + 1, 3, 1);
+#if defined(HAVE_SSSE3_AT_COMPILE_TIME) && defined(DEBUG)
+    {
+        CPLConfigOptionSetter oSetters("GDAL_USE_SSSE3", "NO", false);
+        test(src, GDT_Byte, dst, GDT_Byte, 1024 * 1024 + 1, 3, 10,
+             " (no SSSE3)");
+    }
+#endif
+    test(src, GDT_Byte, dst, GDT_Byte, 1024 * 1024 + 1, 4, 1);
+    test(src, GDT_Byte, dst, GDT_Byte, 1024 * 1024 + 1, 5, 1);
+#if defined(HAVE_SSSE3_AT_COMPILE_TIME) && defined(DEBUG)
+    {
+        CPLConfigOptionSetter oSetters("GDAL_USE_SSSE3", "NO", false);
+        test(src, GDT_Byte, dst, GDT_Byte, 1024 * 1024 + 1, 5, 10,
+             " (no SSSE3)");
+    }
+#endif
+    test(src, GDT_Byte, dst, GDT_Byte, 1024 * 1024 + 1, 16 + 1, 10);
+#if defined(HAVE_SSSE3_AT_COMPILE_TIME) && defined(DEBUG)
+    {
+        CPLConfigOptionSetter oSetters("GDAL_USE_SSSE3", "NO", false);
+        test(src, GDT_Byte, dst, GDT_Byte, 1024 * 1024 + 1, 16 + 1, 10,
+             " (no SSSE3)");
+    }
+#endif
+    test(src, GDT_Byte, dst, GDT_Byte, 1024 * 1024 + 1, 100, 10);
+    test(src, GDT_Byte, dst, GDT_Byte, 70 * 1024 + 1, 1024 + 1, 10);
+#if defined(HAVE_SSSE3_AT_COMPILE_TIME) && defined(DEBUG)
+    {
+        CPLConfigOptionSetter oSetters("GDAL_USE_SSSE3", "NO", false);
+        test(src, GDT_Byte, dst, GDT_Byte, 70 * 1024 + 1, 1024 + 1, 10,
+             " (no SSSE3)");
+    }
+#endif
+    test(src, GDT_Byte, dst, GDT_Byte, 7 * 1024 + 1, 10 * 1024 + 1, 10);
+#if defined(HAVE_SSSE3_AT_COMPILE_TIME) && defined(DEBUG)
+    {
+        CPLConfigOptionSetter oSetters("GDAL_USE_SSSE3", "NO", false);
+        test(src, GDT_Byte, dst, GDT_Byte, 7 * 1024 + 1, 10 * 1024 + 1, 10,
+             " (no SSSE3)");
+    }
+#endif
+
+    VSIFree(src);
+    VSIFree(dst);
+    return 0;
+}