Skip to content

Commit

Permalink
GDALTranspose2D(): add SSSE3 optimizations
Browse files Browse the repository at this point in the history
  • Loading branch information
rouault committed Jan 4, 2025
1 parent dc7345a commit 1e4b1bc
Show file tree
Hide file tree
Showing 6 changed files with 655 additions and 8 deletions.
93 changes: 93 additions & 0 deletions autotest/cpp/test_gdal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4989,4 +4989,97 @@ TEST_F(test_gdal, GDALTranspose2D)
}
}

// Checks GDALTranspose2D() for GDT_Byte -> GDT_Byte on a set of
// (width, height) combinations chosen to hit its specialised code paths
// (height 2, height 4, and the other shapes labelled below).
TEST_F(test_gdal, GDALTranspose2D_Byte_optims)
{
    // The largest case below is 19 x 17; smaller cases use a prefix of the
    // same buffers.
    std::vector<GByte> in;
    for (int i = 0; i < 19 * 17; ++i)
        in.push_back(static_cast<GByte>(i % 256));

    std::vector<GByte> out(in.size());

    struct TransposeCase
    {
        const char *pszDesc;  // reported through SCOPED_TRACE on failure
        size_t nWidth;        // source width (number of columns)
        size_t nHeight;       // source height (number of rows)
    };

    const TransposeCase asCases[] = {
        {"SSSE3 optim (16x16) blocks", 19, 17},
        {"Optim H = 2", 19, 2},
        {"SSSE3 optim H = 3", 19, 3},
        {"Optim H = 4", 19, 4},
        {"SSSE3 optim H = 5 with W < 16", 15, 5},
        {"SSSE3 optim H = 5 with W >= 16", 19, 5},
    };

    for (const auto &sCase : asCases)
    {
        // All cases share the same assertion line, so tag failures with the
        // case description.
        SCOPED_TRACE(sCase.pszDesc);

        const size_t W = sCase.nWidth;
        const size_t H = sCase.nHeight;

        // Poison every output position with the bitwise complement of the
        // value expected there. Without this, results left in "out" by a
        // previous case could satisfy the checks below even if the current
        // call failed to write some elements (e.g. row 0 of the two H = 5
        // cases maps to the same output positions and values).
        for (size_t y = 0; y < H; ++y)
        {
            for (size_t x = 0; x < W; ++x)
            {
                out[x * H + y] = static_cast<GByte>(in[y * W + x] ^ 0xFF);
            }
        }

        GDALTranspose2D(in.data(), GDT_Byte, out.data(), GDT_Byte, W, H);

        // Element (x, y) of the W x H source must end up at (y, x) of the
        // H x W destination.
        for (size_t y = 0; y < H; ++y)
        {
            for (size_t x = 0; x < W; ++x)
            {
                EXPECT_EQ(out[x * H + y], in[y * W + x]);
            }
        }
    }
}

} // namespace
75 changes: 75 additions & 0 deletions gcore/rasterio.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5738,6 +5738,56 @@ static void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, DST *pDst,
#undef GDALTranspose2DComplex_internal
}

/************************************************************************/
/* GDALInterleave2Byte() */
/************************************************************************/

/** Interleave two consecutive runs of nIters bytes into byte pairs.
 *
 * pSrc is read as two back-to-back runs of nIters bytes each. For every
 * index i in [0, nIters), byte i of the first run is written to pDst[2 * i]
 * and byte i of the second run to pDst[2 * i + 1].
 *
 * @param pSrc   Source buffer; 2 * nIters bytes are read from it.
 * @param pDst   Destination buffer; 2 * nIters bytes are written to it.
 *               Declared CPL_RESTRICT, i.e. must not overlap pSrc.
 * @param nIters Number of bytes in each source run.
 */
// NOTE(review): the attributes below presumably exist so that the loop is
// auto-vectorized in its own out-of-line function regardless of the
// optimization flags of the translation unit -- confirm with the author.
#if defined(__GNUC__)
#if !defined(__clang__)
__attribute__((optimize("tree-vectorize")))
#endif
__attribute__((noinline))
#endif
static void
GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
                    uint8_t *CPL_RESTRICT pDst, size_t nIters)
{
    // Start of each of the two source runs.
    const uint8_t *const pabyRun0 = pSrc;
    const uint8_t *const pabyRun1 = pSrc + nIters;

#if defined(__clang__)
#pragma clang loop vectorize(enable)
#endif
    for (size_t iByte = 0; iByte < nIters; ++iByte)
    {
        pDst[2 * iByte] = pabyRun0[iByte];
        pDst[2 * iByte + 1] = pabyRun1[iByte];
    }
}

/************************************************************************/
/* GDALInterleave4Byte() */
/************************************************************************/

/** Interleave four consecutive runs of nIters bytes into byte quadruplets.
 *
 * pSrc is read as four back-to-back runs of nIters bytes each. For every
 * index i in [0, nIters) and run k in [0, 4), byte i of run k is written to
 * pDst[4 * i + k].
 *
 * @param pSrc   Source buffer; 4 * nIters bytes are read from it.
 * @param pDst   Destination buffer; 4 * nIters bytes are written to it.
 *               Declared CPL_RESTRICT, i.e. must not overlap pSrc.
 * @param nIters Number of bytes in each source run.
 */
// NOTE(review): the attributes below presumably exist so that the loop is
// auto-vectorized in its own out-of-line function regardless of the
// optimization flags of the translation unit -- confirm with the author.
#if defined(__GNUC__)
#if !defined(__clang__)
__attribute__((optimize("tree-vectorize")))
#endif
__attribute__((noinline))
#endif
static void
GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
                    uint8_t *CPL_RESTRICT pDst, size_t nIters)
{
    // Start of each of the four source runs.
    const uint8_t *const pabyRun0 = pSrc;
    const uint8_t *const pabyRun1 = pSrc + nIters;
    const uint8_t *const pabyRun2 = pSrc + 2 * nIters;
    const uint8_t *const pabyRun3 = pSrc + 3 * nIters;

#if defined(__clang__)
#pragma clang loop vectorize(enable)
#endif
    for (size_t iByte = 0; iByte < nIters; ++iByte)
    {
        pDst[4 * iByte] = pabyRun0[iByte];
        pDst[4 * iByte + 1] = pabyRun1[iByte];
        pDst[4 * iByte + 2] = pabyRun2[iByte];
        pDst[4 * iByte + 3] = pabyRun3[iByte];
    }
}

/************************************************************************/
/* GDALTranspose2D() */
/************************************************************************/
Expand All @@ -5757,6 +5807,31 @@ static void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, DST *pDst,
void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, void *pDst,
GDALDataType eDstType, size_t nSrcWidth, size_t nSrcHeight)
{
if (eSrcType == GDT_Byte && eDstType == GDT_Byte)
{
if (nSrcHeight == 2)
{
GDALInterleave2Byte(static_cast<const uint8_t *>(pSrc),
static_cast<uint8_t *>(pDst), nSrcWidth);
return;
}
if (nSrcHeight == 4)
{
GDALInterleave4Byte(static_cast<const uint8_t *>(pSrc),
static_cast<uint8_t *>(pDst), nSrcWidth);
return;
}
#ifdef HAVE_SSSE3_AT_COMPILE_TIME
if (CPLHaveRuntimeSSSE3())
{
GDALTranspose2DSingleToSingle_SSSE3(
static_cast<const uint8_t *>(pSrc),
static_cast<uint8_t *>(pDst), nSrcWidth, nSrcHeight);
return;
}
#endif
}

#define GDALTranspose2D_internal(DST_TYPE_CST, DST_TYPE, DST_IS_COMPLEX) \
case DST_TYPE_CST: \
GDALTranspose2D<DST_TYPE, DST_IS_COMPLEX>( \
Expand Down
Loading

0 comments on commit 1e4b1bc

Please sign in to comment.