diff --git a/autotest/cpp/test_gdal.cpp b/autotest/cpp/test_gdal.cpp
index 5b3289cf66ce..0a5a160c521a 100644
--- a/autotest/cpp/test_gdal.cpp
+++ b/autotest/cpp/test_gdal.cpp
@@ -4989,4 +4989,97 @@ TEST_F(test_gdal, GDALTranspose2D)
     }
 }
 
+TEST_F(test_gdal, GDALTranspose2D_Byte_optims)
+{
+    std::vector<GByte> in;
+    for (int i = 0; i < 19 * 17; ++i)
+        in.push_back(static_cast<GByte>(i % 256));
+
+    std::vector<GByte> out(in.size());
+
+    // SSSE3 optim (16x16 blocks)
+    {
+        constexpr int W = 19;
+        constexpr int H = 17;
+        GDALTranspose2D(in.data(), GDT_Byte, out.data(), GDT_Byte, W, H);
+        for (int y = 0; y < H; ++y)
+        {
+            for (int x = 0; x < W; ++x)
+            {
+                EXPECT_EQ(out[x * H + y], in[y * W + x]);
+            }
+        }
+    }
+
+    // Optim H = 2
+    {
+        constexpr int W = 19;
+        constexpr int H = 2;
+        GDALTranspose2D(in.data(), GDT_Byte, out.data(), GDT_Byte, W, H);
+        for (int y = 0; y < H; ++y)
+        {
+            for (int x = 0; x < W; ++x)
+            {
+                EXPECT_EQ(out[x * H + y], in[y * W + x]);
+            }
+        }
+    }
+
+    // SSSE3 optim H = 3
+    {
+        constexpr int W = 19;
+        constexpr int H = 3;
+        GDALTranspose2D(in.data(), GDT_Byte, out.data(), GDT_Byte, W, H);
+        for (int y = 0; y < H; ++y)
+        {
+            for (int x = 0; x < W; ++x)
+            {
+                EXPECT_EQ(out[x * H + y], in[y * W + x]);
+            }
+        }
+    }
+
+    // Optim H = 4
+    {
+        constexpr int W = 19;
+        constexpr int H = 4;
+        GDALTranspose2D(in.data(), GDT_Byte, out.data(), GDT_Byte, W, H);
+        for (int y = 0; y < H; ++y)
+        {
+            for (int x = 0; x < W; ++x)
+            {
+                EXPECT_EQ(out[x * H + y], in[y * W + x]);
+            }
+        }
+    }
+
+    // SSSE3 optim H = 5 with W < 16
+    {
+        constexpr int W = 15;
+        constexpr int H = 5;
+        GDALTranspose2D(in.data(), GDT_Byte, out.data(), GDT_Byte, W, H);
+        for (int y = 0; y < H; ++y)
+        {
+            for (int x = 0; x < W; ++x)
+            {
+                EXPECT_EQ(out[x * H + y], in[y * W + x]);
+            }
+        }
+    }
+
+    // SSSE3 optim H = 5 with W >= 16
+    {
+        constexpr int W = 19;
+        constexpr int H = 5;
+        GDALTranspose2D(in.data(), GDT_Byte, out.data(), GDT_Byte, W, H);
+        for (int y = 0; y < H; ++y)
+        {
+            for (int x = 0; x < W; ++x)
+            {
+                EXPECT_EQ(out[x * H + y], in[y * W + x]);
+            }
+        }
+    }
+}
+
 }  // namespace
diff --git a/gcore/rasterio.cpp b/gcore/rasterio.cpp
index 716e6bfac4a1..d4949efbf47e 100644
--- a/gcore/rasterio.cpp
+++ b/gcore/rasterio.cpp
@@ -5738,6 +5738,56 @@ static void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, DST *pDst,
 #undef GDALTranspose2DComplex_internal
 }
 
+/************************************************************************/
+/*                        GDALInterleave2Byte()                         */
+/************************************************************************/
+
+#if defined(__GNUC__) && !defined(__clang__)
+__attribute__((optimize("tree-vectorize")))
+#endif
+#if defined(__GNUC__)
+__attribute__((noinline))
+#endif
+static void
+GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
+                    uint8_t *CPL_RESTRICT pDst, size_t nIters)
+{
+#if defined(__clang__)
+#pragma clang loop vectorize(enable)
+#endif
+    for (size_t i = 0; i < nIters; ++i)
+    {
+        pDst[2 * i + 0] = pSrc[i + 0 * nIters];
+        pDst[2 * i + 1] = pSrc[i + 1 * nIters];
+    }
+}
+
+/************************************************************************/
+/*                        GDALInterleave4Byte()                         */
+/************************************************************************/
+
+#if defined(__GNUC__) && !defined(__clang__)
+__attribute__((optimize("tree-vectorize")))
+#endif
+#if defined(__GNUC__)
+__attribute__((noinline))
+#endif
+static void
+GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
+                    uint8_t *CPL_RESTRICT pDst, size_t nIters)
+{
+#if defined(__clang__)
+#pragma clang loop vectorize(enable)
+#endif
+    for (size_t i = 0; i < nIters; ++i)
+    {
+        pDst[4 * i + 0] = pSrc[i + 0 * nIters];
+        pDst[4 * i + 1] = pSrc[i + 1 * nIters];
+        pDst[4 * i + 2] = pSrc[i + 2 * nIters];
+        pDst[4 * i + 3] = pSrc[i + 3 * nIters];
+    }
+}
+
 /************************************************************************/
 /*                          GDALTranspose2D()                           */
 /************************************************************************/
@@ -5757,6 +5807,39 @@ static void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, DST *pDst,
 void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, void *pDst,
                      GDALDataType eDstType, size_t nSrcWidth, size_t nSrcHeight)
 {
+    if (eSrcType == eDstType && (eSrcType == GDT_Byte || eSrcType == GDT_Int8))
+    {
+        if (nSrcHeight == 2)
+        {
+            GDALInterleave2Byte(static_cast<const uint8_t *>(pSrc),
+                                static_cast<uint8_t *>(pDst), nSrcWidth);
+            return;
+        }
+        if (nSrcHeight == 4)
+        {
+            GDALInterleave4Byte(static_cast<const uint8_t *>(pSrc),
+                                static_cast<uint8_t *>(pDst), nSrcWidth);
+            return;
+        }
+#if (defined(HAVE_SSSE3_AT_COMPILE_TIME) &&                                   \
+     (defined(__x86_64) || defined(_M_X64)))
+        if (CPLHaveRuntimeSSSE3())
+        {
+            GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
+                                       static_cast<uint8_t *>(pDst), nSrcWidth,
+                                       nSrcHeight);
+            return;
+        }
+#elif defined(USE_NEON_OPTIMIZATIONS)
+        {
+            GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
+                                       static_cast<uint8_t *>(pDst), nSrcWidth,
+                                       nSrcHeight);
+            return;
+        }
+#endif
+    }
+
 #define GDALTranspose2D_internal(DST_TYPE_CST, DST_TYPE, DST_IS_COMPLEX)      \
     case DST_TYPE_CST:                                                        \
         GDALTranspose2D<SRC, DST_TYPE, DST_IS_COMPLEX>(                       \
diff --git a/gcore/rasterio_ssse3.cpp b/gcore/rasterio_ssse3.cpp
index fa9cd6ab24e4..e3f9636a7c27 100644
--- a/gcore/rasterio_ssse3.cpp
+++ b/gcore/rasterio_ssse3.cpp
@@ -12,6 +12,8 @@
 
 #include "cpl_port.h"
 
+#include <algorithm>
+
 #if (defined(HAVE_SSSE3_AT_COMPILE_TIME) &&                                   \
      (defined(__x86_64) || defined(_M_X64))) ||                               \
     defined(USE_NEON_OPTIMIZATIONS)
@@ -140,6 +142,35 @@ void GDALDeinterleave3Byte_SSSE3(const GByte *CPL_RESTRICT pabySrc,
 }
 #endif
 
+/************************************************************************/
+/*                       GDALTranspose4x4Int32()                        */
+/************************************************************************/
+
+// Considering that the 4 input registers hold a 4x4 matrix of 4-byte words,
+// return the transposition of this 4x4 matrix.
+// Considering that in0 = (in00, in01, in02, in03)
+// Considering that in1 = (in10, in11, in12, in13)
+// Considering that in2 = (in20, in21, in22, in23)
+// Considering that in3 = (in30, in31, in32, in33)
+// Return out0 = (in00, in10, in20, in30)
+// Return out1 = (in01, in11, in21, in31)
+// Return out2 = (in02, in12, in22, in32)
+// Return out3 = (in03, in13, in23, in33)
+inline void GDALTranspose4x4Int32(__m128i in0, __m128i in1, __m128i in2,
+                                  __m128i in3, __m128i &out0, __m128i &out1,
+                                  __m128i &out2, __m128i &out3)
+{
+    __m128i tmp0 = _mm_unpacklo_epi32(in0, in1);  // (in00, in10, in01, in11)
+    __m128i tmp1 = _mm_unpackhi_epi32(in0, in1);  // (in02, in12, in03, in13)
+    __m128i tmp2 = _mm_unpacklo_epi32(in2, in3);  // (in20, in30, in21, in31)
+    __m128i tmp3 = _mm_unpackhi_epi32(in2, in3);  // (in22, in32, in23, in33)
+
+    out0 = _mm_unpacklo_epi64(tmp0, tmp2);  // (in00, in10, in20, in30)
+    out1 = _mm_unpackhi_epi64(tmp0, tmp2);  // (in01, in11, in21, in31)
+    out2 = _mm_unpacklo_epi64(tmp1, tmp3);  // (in02, in12, in22, in32)
+    out3 = _mm_unpackhi_epi64(tmp1, tmp3);  // (in03, in13, in23, in33)
+}
+
 /************************************************************************/
 /*                     GDALDeinterleave4Byte_SSSE3()                    */
 /************************************************************************/
@@ -169,14 +200,7 @@ void GDALDeinterleave4Byte_SSSE3(const GByte *CPL_RESTRICT pabySrc,
     xmm2 = _mm_shuffle_epi8(xmm2, shuffle_mask);  // W8 W9 WA WB
     xmm3 = _mm_shuffle_epi8(xmm3, shuffle_mask);  // WC WD WE WF
 
-    __m128i xmm01lo = _mm_unpacklo_epi32(xmm0, xmm1);  // W0 W4 W1 W5
-    __m128i xmm01hi = _mm_unpackhi_epi32(xmm0, xmm1);  // W2 W6 W3 W7
-    __m128i xmm23lo = _mm_unpacklo_epi32(xmm2, xmm3);  // W8 WC W9 WD
-    __m128i xmm23hi = _mm_unpackhi_epi32(xmm2, xmm3);  // WA WE WB WF
-    xmm0 = _mm_unpacklo_epi64(xmm01lo, xmm23lo);  // W0 W4 W8 WC
-    xmm1 = _mm_unpackhi_epi64(xmm01lo, xmm23lo);  // W1 W5 W9 WD
-    xmm2 = _mm_unpacklo_epi64(xmm01hi, xmm23hi);  // W2 W6 WA WE
-    xmm3 = _mm_unpackhi_epi64(xmm01hi, xmm23hi);  // W3 W7 WB WF
+    GDALTranspose4x4Int32(xmm0, xmm1, xmm2, xmm3, xmm0, xmm1, xmm2, xmm3);
 
     _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest0 + i), xmm0);
     _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest1 + i), xmm1);
@@ -248,4 +272,362 @@ void GDALDeinterleave4UInt16_SSSE3(const GUInt16* CPL_RESTRICT panSrc,
 }
 #endif
 
+/************************************************************************/
+/*                     GDALInterleave3Byte_SSSE3()                      */
+/************************************************************************/
+
+#if defined(__GNUC__) && !defined(__clang__)
+__attribute__((optimize("tree-vectorize")))
+#endif
+#if defined(__GNUC__)
+__attribute__((noinline))
+#endif
+static void
+GDALInterleave3Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
+                          uint8_t *CPL_RESTRICT pDst, size_t nIters)
+{
+#if defined(__clang__)
+#pragma clang loop vectorize(enable)
+#endif
+    for (size_t i = 0; i < nIters; ++i)
+    {
+        pDst[3 * i + 0] = pSrc[i + 0 * nIters];
+        pDst[3 * i + 1] = pSrc[i + 1 * nIters];
+        pDst[3 * i + 2] = pSrc[i + 2 * nIters];
+    }
+}
+
+/************************************************************************/
+/*                     GDALInterleave5Byte_SSSE3()                      */
+/************************************************************************/
+
+inline __m128i loadu(const uint8_t *pSrc, size_t i, size_t srcStride)
+{
+    return _mm_loadu_si128(
+        reinterpret_cast<const __m128i *>(pSrc + i * srcStride));
+}
+
+inline void storeu(uint8_t *pDst, size_t i, size_t dstStride, __m128i reg)
+{
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + i * dstStride), reg);
+}
+
+inline __m128i GDAL_mm_or_5_si128(__m128i r0, __m128i r1, __m128i r2,
+                                  __m128i r3, __m128i r4)
+{
+    return _mm_or_si128(
+        _mm_or_si128(_mm_or_si128(r0, r1), _mm_or_si128(r2, r3)), r4);
+}
+
+void GDALInterleave5Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
+                               uint8_t *CPL_RESTRICT pDst, size_t nIters)
+{
+    size_t i = 0;
+    constexpr size_t VALS_PER_ITER = 16;
+
+    if (nIters >= VALS_PER_ITER)
+    {
+        // clang-format off
+        constexpr char X = -1;
+        // How to dispatch 16 values of row=0 onto 5x16 bytes
+        const __m128i xmm_shuffle00 = _mm_setr_epi8(0, X, X, X, X,
+                                                    1, X, X, X, X,
+                                                    2, X, X, X, X,
+                                                    3);
+        const __m128i xmm_shuffle01 = _mm_setr_epi8(   X, X, X, X,
+                                                    4, X, X, X, X,
+                                                    5, X, X, X, X,
+                                                    6, X);
+        const __m128i xmm_shuffle02 = _mm_setr_epi8(      X, X, X,
+                                                    7, X, X, X, X,
+                                                    8, X, X, X, X,
+                                                    9, X, X);
+        const __m128i xmm_shuffle03 = _mm_setr_epi8(         X, X,
+                                                    10, X, X, X, X,
+                                                    11, X, X, X, X,
+                                                    12, X, X, X);
+        const __m128i xmm_shuffle04 = _mm_setr_epi8(            X,
+                                                    13, X, X, X, X,
+                                                    14, X, X, X, X,
+                                                    15, X, X, X, X);
+
+        // How to dispatch 16 values of row=1 onto 5x16 bytes
+        const __m128i xmm_shuffle10 = _mm_setr_epi8(X, 0, X, X, X,
+                                                    X, 1, X, X, X,
+                                                    X, 2, X, X, X,
+                                                    X);
+        const __m128i xmm_shuffle11 = _mm_setr_epi8(   3, X, X, X,
+                                                    X, 4, X, X, X,
+                                                    X, 5, X, X, X,
+                                                    X, 6);
+        const __m128i xmm_shuffle12 = _mm_setr_epi8(      X, X, X,
+                                                    X, 7, X, X, X,
+                                                    X, 8, X, X, X,
+                                                    X, 9, X);
+        const __m128i xmm_shuffle13 = _mm_setr_epi8(         X, X,
+                                                    X, 10, X, X, X,
+                                                    X, 11, X, X, X,
+                                                    X, 12, X, X);
+        const __m128i xmm_shuffle14 = _mm_setr_epi8(            X,
+                                                    X, 13, X, X, X,
+                                                    X, 14, X, X, X,
+                                                    X, 15, X, X, X);
+
+        // How to dispatch 16 values of row=2 onto 5x16 bytes
+        const __m128i xmm_shuffle20 = _mm_setr_epi8(X, X, 0, X, X,
+                                                    X, X, 1, X, X,
+                                                    X, X, 2, X, X,
+                                                    X);
+        const __m128i xmm_shuffle21 = _mm_setr_epi8(   X, 3, X, X,
+                                                    X, X, 4, X, X,
+                                                    X, X, 5, X, X,
+                                                    X, X);
+        const __m128i xmm_shuffle22 = _mm_setr_epi8(      6, X, X,
+                                                    X, X, 7, X, X,
+                                                    X, X, 8, X, X,
+                                                    X, X, 9);
+        const __m128i xmm_shuffle23 = _mm_setr_epi8(         X, X,
+                                                    X, X, 10, X, X,
+                                                    X, X, 11, X, X,
+                                                    X, X, 12, X);
+        const __m128i xmm_shuffle24 = _mm_setr_epi8(            X,
+                                                    X, X, 13, X, X,
+                                                    X, X, 14, X, X,
+                                                    X, X, 15, X, X);
+
+        // How to dispatch 16 values of row=3 onto 5x16 bytes
+        const __m128i xmm_shuffle30 = _mm_setr_epi8(X, X, X, 0, X,
+                                                    X, X, X, 1, X,
+                                                    X, X, X, 2, X,
+                                                    X);
+        const __m128i xmm_shuffle31 = _mm_setr_epi8(   X, X, 3, X,
+                                                    X, X, X, 4, X,
+                                                    X, X, X, 5, X,
+                                                    X, X);
+        const __m128i xmm_shuffle32 = _mm_setr_epi8(      X, 6, X,
+                                                    X, X, X, 7, X,
+                                                    X, X, X, 8, X,
+                                                    X, X, X);
+        const __m128i xmm_shuffle33 = _mm_setr_epi8(         9, X,
+                                                    X, X, X, 10, X,
+                                                    X, X, X, 11, X,
+                                                    X, X, X, 12);
+        const __m128i xmm_shuffle34 = _mm_setr_epi8(            X,
+                                                    X, X, X, 13, X,
+                                                    X, X, X, 14, X,
+                                                    X, X, X, 15, X);
+
+        // How to dispatch 16 values of row=4 onto 5x16 bytes
+        const __m128i xmm_shuffle40 = _mm_setr_epi8(X, X, X, X, 0,
+                                                    X, X, X, X, 1,
+                                                    X, X, X, X, 2,
+                                                    X);
+        const __m128i xmm_shuffle41 = _mm_setr_epi8(   X, X, X, 3,
+                                                    X, X, X, X, 4,
+                                                    X, X, X, X, 5,
+                                                    X, X);
+        const __m128i xmm_shuffle42 = _mm_setr_epi8(      X, X, 6,
+                                                    X, X, X, X, 7,
+                                                    X, X, X, X, 8,
+                                                    X, X, X);
+        const __m128i xmm_shuffle43 = _mm_setr_epi8(         X, 9,
+                                                    X, X, X, X, 10,
+                                                    X, X, X, X, 11,
+                                                    X, X, X, X);
+        const __m128i xmm_shuffle44 = _mm_setr_epi8(            12,
+                                                    X, X, X, X, 13,
+                                                    X, X, X, X, 14,
+                                                    X, X, X, X, 15);
+        // clang-format on
+
+        for (; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
+        {
+#define LOAD(x) __m128i xmm##x = loadu(pSrc + i, x, nIters)
+            LOAD(0);
+            LOAD(1);
+            LOAD(2);
+            LOAD(3);
+            LOAD(4);
+
+#define SHUFFLE(x, y) _mm_shuffle_epi8(xmm##y, xmm_shuffle##y##x)
+#define COMBINE_5(x)                                                          \
+    GDAL_mm_or_5_si128(SHUFFLE(x, 0), SHUFFLE(x, 1), SHUFFLE(x, 2),           \
+                       SHUFFLE(x, 3), SHUFFLE(x, 4))
+
+#define STORE(x)                                                              \
+    storeu(pDst, 5 * (i / VALS_PER_ITER) + x, VALS_PER_ITER, COMBINE_5(x))
+            STORE(0);
+            STORE(1);
+            STORE(2);
+            STORE(3);
+            STORE(4);
+#undef LOAD
+#undef COMBINE_5
+#undef SHUFFLE
+#undef STORE
+        }
+    }
+
+    for (; i < nIters; ++i)
+    {
+#define INTERLEAVE(x) pDst[5 * i + x] = pSrc[i + x * nIters]
+        INTERLEAVE(0);
+        INTERLEAVE(1);
+        INTERLEAVE(2);
+        INTERLEAVE(3);
+        INTERLEAVE(4);
+#undef INTERLEAVE
+    }
+}
+
+/************************************************************************/
+/*                     GDALTranspose2D_Byte_SSSE3()                     */
+/************************************************************************/
+
+// Given r = (b00, b01, b02, b03,
+//            b10, b11, b12, b13,
+//            b20, b21, b22, b23,
+//            b30, b31, b32, b33)
+// Return    (b00, b10, b20, b30,
+//            b01, b11, b21, b31,
+//            b02, b12, b22, b32,
+//            b03, b13, b23, b33)
+inline void GDALReorderForTranspose4x4(__m128i &r)
+{
+    const __m128i shuffle_mask =
+        _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
+
+    r = _mm_shuffle_epi8(r, shuffle_mask);
+}
+
+// Transpose the 16x16 byte values contained in the 16 SSE registers
+inline void GDALTranspose16x16ByteBlock_SSSE3(
+    __m128i &r00, __m128i &r01, __m128i &r02, __m128i &r03, __m128i &r04,
+    __m128i &r05, __m128i &r06, __m128i &r07, __m128i &r08, __m128i &r09,
+    __m128i &r10, __m128i &r11, __m128i &r12, __m128i &r13, __m128i &r14,
+    __m128i &r15)
+{
+    __m128i tmp00, tmp01, tmp02, tmp03;
+    __m128i tmp10, tmp11, tmp12, tmp13;
+    __m128i tmp20, tmp21, tmp22, tmp23;
+    __m128i tmp30, tmp31, tmp32, tmp33;
+
+    GDALTranspose4x4Int32(r00, r01, r02, r03, tmp00, tmp01, tmp02, tmp03);
+    GDALTranspose4x4Int32(r04, r05, r06, r07, tmp10, tmp11, tmp12, tmp13);
+    GDALTranspose4x4Int32(r08, r09, r10, r11, tmp20, tmp21, tmp22, tmp23);
+    GDALTranspose4x4Int32(r12, r13, r14, r15, tmp30, tmp31, tmp32, tmp33);
+
+    GDALReorderForTranspose4x4(tmp00);
+    GDALReorderForTranspose4x4(tmp01);
+    GDALReorderForTranspose4x4(tmp02);
+    GDALReorderForTranspose4x4(tmp03);
+    GDALReorderForTranspose4x4(tmp10);
+    GDALReorderForTranspose4x4(tmp11);
+    GDALReorderForTranspose4x4(tmp12);
+    GDALReorderForTranspose4x4(tmp13);
+    GDALReorderForTranspose4x4(tmp20);
+    GDALReorderForTranspose4x4(tmp21);
+    GDALReorderForTranspose4x4(tmp22);
+    GDALReorderForTranspose4x4(tmp23);
+    GDALReorderForTranspose4x4(tmp30);
+    GDALReorderForTranspose4x4(tmp31);
+    GDALReorderForTranspose4x4(tmp32);
+    GDALReorderForTranspose4x4(tmp33);
+
+    GDALTranspose4x4Int32(tmp00, tmp10, tmp20, tmp30, r00, r01, r02, r03);
+    GDALTranspose4x4Int32(tmp01, tmp11, tmp21, tmp31, r04, r05, r06, r07);
+    GDALTranspose4x4Int32(tmp02, tmp12, tmp22, tmp32, r08, r09, r10, r11);
+    GDALTranspose4x4Int32(tmp03, tmp13, tmp23, tmp33, r12, r13, r14, r15);
+}
+
+inline void GDALTranspose2D16x16Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
+                                           uint8_t *CPL_RESTRICT pDst,
+                                           size_t srcStride, size_t dstStride)
+{
+#define LOAD(x) __m128i r##x = loadu(pSrc, x, srcStride)
+    LOAD(0);
+    LOAD(1);
+    LOAD(2);
+    LOAD(3);
+    LOAD(4);
+    LOAD(5);
+    LOAD(6);
+    LOAD(7);
+    LOAD(8);
+    LOAD(9);
+    LOAD(10);
+    LOAD(11);
+    LOAD(12);
+    LOAD(13);
+    LOAD(14);
+    LOAD(15);
+#undef LOAD
+
+    GDALTranspose16x16ByteBlock_SSSE3(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9,
+                                      r10, r11, r12, r13, r14, r15);
+
+#define STORE(x) storeu(pDst, x, dstStride, r##x)
+    STORE(0);
+    STORE(1);
+    STORE(2);
+    STORE(3);
+    STORE(4);
+    STORE(5);
+    STORE(6);
+    STORE(7);
+    STORE(8);
+    STORE(9);
+    STORE(10);
+    STORE(11);
+    STORE(12);
+    STORE(13);
+    STORE(14);
+    STORE(15);
+#undef STORE
+}
+
+void GDALTranspose2D_Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
+                                uint8_t *CPL_RESTRICT pDst, size_t nSrcWidth,
+                                size_t nSrcHeight)
+{
+    if (nSrcHeight == 3)
+    {
+        GDALInterleave3Byte_SSSE3(pSrc, pDst, nSrcWidth);
+    }
+    else if (nSrcHeight == 5)
+    {
+        GDALInterleave5Byte_SSSE3(pSrc, pDst, nSrcWidth);
+    }
+    else
+    {
+        constexpr size_t blocksize = 16;
+        for (size_t i = 0; i < nSrcHeight; i += blocksize)
+        {
+            const size_t max_k = std::min(i + blocksize, nSrcHeight);
+            for (size_t j = 0; j < nSrcWidth; j += blocksize)
+            {
+                // transpose the block beginning at [i,j]
+                const size_t max_l = std::min(j + blocksize, nSrcWidth);
+                if (max_k - i == blocksize && max_l - j == blocksize)
+                {
+                    GDALTranspose2D16x16Byte_SSSE3(&pSrc[j + i * nSrcWidth],
+                                                   &pDst[i + j * nSrcHeight],
+                                                   nSrcWidth, nSrcHeight);
+                }
+                else
+                {
+                    for (size_t k = i; k < max_k; ++k)
+                    {
+                        for (size_t l = j; l < max_l; ++l)
+                        {
+                            GDALCopyWord(pSrc[l + k * nSrcWidth],
+                                         pDst[k + l * nSrcHeight]);
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
 #endif  // HAVE_SSSE3_AT_COMPILE_TIME
diff --git a/gcore/rasterio_ssse3.h b/gcore/rasterio_ssse3.h
index ac20e45c6c57..f720201842ac 100644
--- a/gcore/rasterio_ssse3.h
+++ b/gcore/rasterio_ssse3.h
@@ -54,6 +54,10 @@ void GDALDeinterleave4UInt16_SSSE3(const GUInt16 *CPL_RESTRICT panSrc,
                                    size_t nIters);
 #endif
 
+void GDALTranspose2D_Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
+                                uint8_t *CPL_RESTRICT pDst, size_t nSrcWidth,
+                                size_t nSrcHeight);
+
 #endif
 
 #endif /* RASTERIO_SSSE3_H_INCLUDED */
diff --git a/perftests/CMakeLists.txt b/perftests/CMakeLists.txt
index a103013d3c9b..070894d812d0 100644
--- a/perftests/CMakeLists.txt
+++ b/perftests/CMakeLists.txt
@@ -30,3 +30,12 @@ if (GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS)
 endif()
 add_test(NAME testperf_gdal_minmax_element COMMAND testperf_gdal_minmax_element)
 set_property(TEST testperf_gdal_minmax_element PROPERTY ENVIRONMENT "${TEST_ENV}")
+
+gdal_test_target(testperftranspose testperftranspose.cpp)
+if (HAVE_SSSE3_AT_COMPILE_TIME)
+  target_compile_definitions(testperftranspose PRIVATE -DHAVE_SSSE3_AT_COMPILE_TIME)
+endif()
+if (CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
+  add_test(NAME testperftranspose COMMAND testperftranspose)
+  set_property(TEST testperftranspose PROPERTY ENVIRONMENT "${TEST_ENV}")
+endif()
diff --git a/perftests/testperftranspose.cpp b/perftests/testperftranspose.cpp
new file mode 100644
index 000000000000..504aa5da844f
--- /dev/null
+++ b/perftests/testperftranspose.cpp
@@ -0,0 +1,91 @@
+/******************************************************************************
+ * $Id$
+ *
+ * Project:  GDAL Core
+ * Purpose:  Test performance of GDALTranspose2D().
+ * Author:   Even Rouault, <even dot rouault at spatialys.com>
+ *
+ ******************************************************************************
+ * Copyright (c) 2025, Even Rouault
+ *
+ * SPDX-License-Identifier: MIT
+ ****************************************************************************/
+
+#include "gdal.h"
+#include "cpl_conv.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#define SIZE (1024 * 1024 + 1) * 100
+
+static void test(const void *pSrc, GDALDataType eSrcType, void *pDst,
+                 GDALDataType eDstType, int W, int H, int reducFactor,
+                 const char *extraMsg = "")
+{
+    CPLAssert(W * H <= SIZE);
+
+    const int niters =
+        static_cast<int>(4000U * 1000 * 1000 / reducFactor / W / H);
+    const auto start = clock();
+    for (int i = 0; i < niters; ++i)
+        GDALTranspose2D(pSrc, eSrcType, pDst, eDstType, W, H);
+    const auto end = clock();
+    printf("W=%d, H=%d, reducFactor=%d%s: %0.2f sec\n", W, H, reducFactor,
+           extraMsg, (end - start) * reducFactor * 1.0 / CLOCKS_PER_SEC);
+}
+
+int main(int /* argc */, char * /* argv */[])
+{
+    void *src = CPLCalloc(1, SIZE);
+    void *dst = CPLCalloc(1, SIZE);
+
+    test(src, GDT_Byte, dst, GDT_Byte, 1024 * 1024 + 1, 2, 1);
+    test(src, GDT_Byte, dst, GDT_Byte, 1024 * 1024 + 1, 3, 1);
+#if defined(HAVE_SSSE3_AT_COMPILE_TIME) && defined(DEBUG)
+    {
+        CPLConfigOptionSetter oSetters("GDAL_USE_SSSE3", "NO", false);
+        test(src, GDT_Byte, dst, GDT_Byte, 1024 * 1024 + 1, 3, 10,
+             " (no SSSE3)");
+    }
+#endif
+    test(src, GDT_Byte, dst, GDT_Byte, 1024 * 1024 + 1, 4, 1);
+    test(src, GDT_Byte, dst, GDT_Byte, 1024 * 1024 + 1, 5, 1);
+#if defined(HAVE_SSSE3_AT_COMPILE_TIME) && defined(DEBUG)
+    {
+        CPLConfigOptionSetter oSetters("GDAL_USE_SSSE3", "NO", false);
+        test(src, GDT_Byte, dst, GDT_Byte, 1024 * 1024 + 1, 5, 10,
+             " (no SSSE3)");
+    }
+#endif
+    test(src, GDT_Byte, dst, GDT_Byte, 1024 * 1024 + 1, 16 + 1, 10);
+#if defined(HAVE_SSSE3_AT_COMPILE_TIME) && defined(DEBUG)
+    {
+        CPLConfigOptionSetter oSetters("GDAL_USE_SSSE3", "NO", false);
+        test(src, GDT_Byte, dst, GDT_Byte, 1024 * 1024 + 1, 16 + 1, 10,
+             " (no SSSE3)");
+    }
+#endif
+    test(src, GDT_Byte, dst, GDT_Byte, 1024 * 1024 + 1, 100, 10);
+    test(src, GDT_Byte, dst, GDT_Byte, 70 * 1024 + 1, 1024 + 1, 10);
+#if defined(HAVE_SSSE3_AT_COMPILE_TIME) && defined(DEBUG)
+    {
+        CPLConfigOptionSetter oSetters("GDAL_USE_SSSE3", "NO", false);
+        test(src, GDT_Byte, dst, GDT_Byte, 70 * 1024 + 1, 1024 + 1, 10,
+             " (no SSSE3)");
+    }
+#endif
+    test(src, GDT_Byte, dst, GDT_Byte, 7 * 1024 + 1, 10 * 1024 + 1, 10);
+#if defined(HAVE_SSSE3_AT_COMPILE_TIME) && defined(DEBUG)
+    {
+        CPLConfigOptionSetter oSetters("GDAL_USE_SSSE3", "NO", false);
+        test(src, GDT_Byte, dst, GDT_Byte, 7 * 1024 + 1, 10 * 1024 + 1, 10,
+             " (no SSSE3)");
+    }
+#endif
+
+    VSIFree(src);
+    VSIFree(dst);
+    return 0;
+}