From 1bbaba4103dba5879017e289745f6507f403c9e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 15 Apr 2022 00:35:22 +0200 Subject: [PATCH 01/13] Fix some NEON code that had bad compile-time checks (and some didn't compile) --- Common/Data/Convert/SmallDataConvert.h | 23 +++++++++++----------- Core/HLE/sceNetAdhoc.cpp | 2 +- Core/MIPS/ARM64/Arm64CompVFPU.cpp | 2 +- GPU/GPUCommon.cpp | 27 ++++++++++++++++++++++++++ GPU/GPUCommon.h | 19 +----------------- 5 files changed, 41 insertions(+), 32 deletions(-) diff --git a/Common/Data/Convert/SmallDataConvert.h b/Common/Data/Convert/SmallDataConvert.h index ac31d5116f7b..a877295918a7 100644 --- a/Common/Data/Convert/SmallDataConvert.h +++ b/Common/Data/Convert/SmallDataConvert.h @@ -9,7 +9,7 @@ #ifdef _M_SSE #include #endif -#if PPSSPP_PLATFORM(ARM_NEON) +#if PPSSPP_ARCH(ARM_NEON) #if defined(_MSC_VER) && PPSSPP_ARCH(ARM64) #include #else @@ -31,13 +31,12 @@ inline void Uint8x4ToFloat4(float f[4], uint32_t u) { __m128i value32 = _mm_unpacklo_epi16(_mm_unpacklo_epi8(value, zero), zero); __m128 fvalues = _mm_mul_ps(_mm_cvtepi32_ps(value32), _mm_load_ps(one_over_255_x4)); _mm_storeu_ps(f, fvalues); -#elif PPSSPP_PLATFORM(ARM_NEON) - const float32x4_t one_over = vdupq_n_f32(1.0f/255.0f); - const uint8x8_t value = vld1_lane_u32(u); - const uint16x8_t value16 = vmovl_s8(value); - const uint32x4_t value32 = vmovl_s16(vget_low_s16(value16)); - const float32x4_t valueFloat = vmulq_f32(vcvtq_f32_u32(value32), one_over); - vst1q_u32((uint32_t *)dest, valueFloat); +#elif PPSSPP_ARCH(ARM_NEON) + const uint8x8_t value = (uint8x8_t)vdup_n_u32(u); + const uint16x8_t value16 = vmovl_u8(value); + const uint32x4_t value32 = vmovl_u16(vget_low_u16(value16)); + const float32x4_t valueFloat = vmulq_f32(vcvtq_f32_u32(value32), vdupq_n_f32(1.0f / 255.0f)); + vst1q_f32(f, valueFloat); #else f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f); f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f); @@ -62,7 +61,7 @@ inline uint32_t Float4ToUint8x4(const float f[4]) { } inline void Uint8x3ToFloat4_AlphaUint8(float f[4], uint32_t u, uint8_t alpha) { -#if defined(_M_SSE) || PPSSPP_PLATFORM(ARM_NEON) +#if defined(_M_SSE) || PPSSPP_ARCH(ARM_NEON) Uint8x4ToFloat4(f, (u & 0xFFFFFF) | (alpha << 24)); #else f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f); @@ -73,7 +72,7 @@ inline void Uint8x3ToFloat4_AlphaUint8(float f[4], uint32_t u, uint8_t alpha) { } inline void Uint8x3ToFloat4(float f[4], uint32_t u) { -#if defined(_M_SSE) || PPSSPP_PLATFORM(ARM_NEON) +#if defined(_M_SSE) || PPSSPP_ARCH(ARM_NEON) Uint8x4ToFloat4(f, u & 0xFFFFFF); #else f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f); @@ -168,8 +167,8 @@ inline void ExpandFloat24x3ToFloat4(float dest[4], const uint32_t src[3]) { #ifdef _M_SSE __m128i values = _mm_slli_epi32(_mm_loadu_si128((const __m128i *)src), 8); _mm_storeu_si128((__m128i *)dest, values); -#elif PPSSPP_PLATFORM(ARM_NEON) - const uint32x4_t values = vshlq_n_u32(vld1q_u32(&gstate.texscaleu), 8); +#elif PPSSPP_ARCH(ARM_NEON) + const uint32x4_t values = vshlq_n_u32(vld1q_u32(src), 8); vst1q_u32((uint32_t *)dest, values); #else uint32_t temp[4] = { src[0] << 8, src[1] << 8, src[2] << 8, 0 }; diff --git a/Core/HLE/sceNetAdhoc.cpp b/Core/HLE/sceNetAdhoc.cpp index 88d8551eb4d7..6b5584678c4a 100644 --- a/Core/HLE/sceNetAdhoc.cpp +++ b/Core/HLE/sceNetAdhoc.cpp @@ -3318,7 +3318,7 @@ int RecreatePtpSocket(int ptpId) { WARN_LOG(SCENET, "RecreatePtpSocket(%id) - Wrapped Port Detected: Original(%d) -> Requested(%d), Bound(%d) -> BoundOriginal(%d)", ptpId, sock->data.ptp.lport, requestedport, boundport, boundport - portOffset); u16 newlport = boundport - portOffset; if (newlport != sock->data.ptp.lport) { - WARN_LOG(SCENET, "RecreatePtpSocket(%id) - Old and New LPort is different! The port may need to be reforwarded"); + WARN_LOG(SCENET, "RecreatePtpSocket(%id) - Old and New LPort is different! The port may need to be reforwarded", ptpId); if (!sock->isClient) UPnP_Add(IP_PROTOCOL_TCP, isOriPort ? newlport : newlport + portOffset, newlport + portOffset); } diff --git a/Core/MIPS/ARM64/Arm64CompVFPU.cpp b/Core/MIPS/ARM64/Arm64CompVFPU.cpp index 0a636dcb39a0..fafd4fa20086 100644 --- a/Core/MIPS/ARM64/Arm64CompVFPU.cpp +++ b/Core/MIPS/ARM64/Arm64CompVFPU.cpp @@ -1792,7 +1792,7 @@ namespace MIPSComp { fpr.MapRegsAndSpillLockV(sregs, sz, 0); gpr.MapReg(MIPS_REG_VFPUCC); for (int i = 0; i < n; i++) { - TSTI2R(gpr.R(MIPS_REG_VFPUCC), 1 << i); + TSTI2R(gpr.R(MIPS_REG_VFPUCC), 1ULL << i); FixupBranch b = B(tf ? CC_NEQ : CC_EQ); fp.FMOV(fpr.V(dregs[i]), fpr.V(sregs[i])); SetJumpTarget(b); diff --git a/GPU/GPUCommon.cpp b/GPU/GPUCommon.cpp index be7742de6f28..14a2f6e9c6f4 100644 --- a/GPU/GPUCommon.cpp +++ b/GPU/GPUCommon.cpp @@ -1,4 +1,16 @@ #include "ppsspp_config.h" + +#if defined(_M_SSE) +#include +#endif +#if PPSSPP_ARCH(ARM_NEON) +#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64) +#include +#else +#include +#endif +#endif + #include #include #include @@ -2959,6 +2971,21 @@ bool GPUCommon::FramebufferReallyDirty() { return true; } +void GPUCommon::UpdateUVScaleOffset() { +#ifdef _M_SSE + __m128i values = _mm_slli_epi32(_mm_load_si128((const __m128i *) & gstate.texscaleu), 8); + _mm_storeu_si128((__m128i *) & gstate_c.uv, values); +#elif PPSSPP_ARCH(ARM_NEON) + const uint32x4_t values = vshlq_n_u32(vld1q_u32((const u32 *)&gstate.texscaleu), 8); + vst1q_u32((u32 *)&gstate_c.uv, values); +#else + gstate_c.uv.uScale = getFloat24(gstate.texscaleu); + gstate_c.uv.vScale = getFloat24(gstate.texscalev); + gstate_c.uv.uOff = getFloat24(gstate.texoffsetu); + gstate_c.uv.vOff = getFloat24(gstate.texoffsetv); +#endif +} + size_t GPUCommon::FormatGPUStatsCommon(char *buffer, size_t size) { float vertexAverageCycles = gpuStats.numVertsSubmitted > 0 ? (float)gpuStats.vertexGPUCycles / (float)gpuStats.numVertsSubmitted : 0.0f; return snprintf(buffer, size, diff --git a/GPU/GPUCommon.h b/GPU/GPUCommon.h index 74ec59ac8770..880aceab2573 100644 --- a/GPU/GPUCommon.h +++ b/GPU/GPUCommon.h @@ -11,10 +11,6 @@ #include #endif -#if defined(_M_SSE) -#include -#endif - class FramebufferManagerCommon; class TextureCacheCommon; class DrawEngineCommon; @@ -218,20 +214,7 @@ class GPUCommon : public GPUInterface, public GPUDebugInterface { GPUgstate GetGState() override; void SetCmdValue(u32 op) override; - void UpdateUVScaleOffset() { -#ifdef _M_SSE - __m128i values = _mm_slli_epi32(_mm_load_si128((const __m128i *)&gstate.texscaleu), 8); - _mm_storeu_si128((__m128i *)&gstate_c.uv, values); -#elif PPSSPP_PLATFORM(ARM_NEON) - const uint32x4_t values = vshlq_n_u32(vld1q_u32(&gstate.texscaleu), 8); - vst1q_u32(&gstate_c.uv, values); -#else - gstate_c.uv.uScale = getFloat24(gstate.texscaleu); - gstate_c.uv.vScale = getFloat24(gstate.texscalev); - gstate_c.uv.uOff = getFloat24(gstate.texoffsetu); - gstate_c.uv.vOff = getFloat24(gstate.texoffsetv); -#endif - } + void UpdateUVScaleOffset(); DisplayList* getList(int listid) override { return &dls[listid]; From 9f7e0978a97da14de6df89ccadb18388d50cf404 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Mon, 11 Apr 2022 20:10:22 +0200 Subject: [PATCH 02/13] AND together colors while decoding, and then check against fullAlphaMask. --- Common/Data/Convert/ColorConv.h | 4 + GPU/Common/TextureCacheCommon.cpp | 145 +++++++++++++++++++++++------- GPU/Common/TextureCacheCommon.h | 4 +- GPU/Common/TextureDecoder.cpp | 23 ++++- GPU/Common/TextureDecoder.h | 37 +++++--- GPU/D3D11/TextureCacheD3D11.cpp | 12 +-- GPU/Directx9/TextureCacheDX9.cpp | 11 +-- GPU/GLES/TextureCacheGLES.cpp | 11 +-- GPU/Vulkan/TextureCacheVulkan.cpp | 16 ++-- 9 files changed, 194 insertions(+), 69 deletions(-) diff --git a/Common/Data/Convert/ColorConv.h b/Common/Data/Convert/ColorConv.h index 7ba629c637f2..66bd47b9cf56 100644 --- a/Common/Data/Convert/ColorConv.h +++ b/Common/Data/Convert/ColorConv.h @@ -99,6 +99,10 @@ void convert5551_dx9(u16* data, u32* out, int width, int l, int u); // TODO: Need to revisit the naming convention of these. Seems totally backwards // now that we've standardized on Draw::DataFormat. +// +// The functions that have the same bit width of input and output can generally +// tolerate being called with src == dst, which is used a lot for ReverseColors +// in the GLES backend. void ConvertBGRA8888ToRGBA8888(u32 *dst, const u32 *src, u32 numPixels); #define ConvertRGBA8888ToBGRA8888 ConvertBGRA8888ToRGBA8888 diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp index 7397b22c96f4..84bbed70be30 100644 --- a/GPU/Common/TextureCacheCommon.cpp +++ b/GPU/Common/TextureCacheCommon.cpp @@ -1306,6 +1306,8 @@ ReplacedTexture &TextureCacheCommon::FindReplacement(TexCacheEntry *entry, int & return replacer_.FindNone(); } +// This is only used in the GLES backend, where we don't point these to video memory. +// So we shouldn't add a check for dstBuf != srcBuf, as long as the functions we call can handle that. static void ReverseColors(void *dstBuf, const void *srcBuf, GETextureFormat fmt, int numPixels, bool useBGRA) { switch (fmt) { case GE_TFMT_4444: @@ -1353,7 +1355,10 @@ static inline void ConvertFormatToRGBA8888(GEPaletteFormat format, u32 *dst, con } template -static void DecodeDXTBlock(uint8_t *out, int outPitch, uint32_t texaddr, const uint8_t *texptr, int w, int h, int bufw, bool reverseColors, bool useBGRA) { +static void DecodeDXTBlocks(uint8_t *out, int outPitch, uint32_t texaddr, const uint8_t *texptr, + int w, int h, int bufw, bool reverseColors, bool useBGRA, + u32 *alphaSum, u32 *fullAlphaMask) { + int minw = std::min(bufw, w); uint32_t *dst = (uint32_t *)out; int outPitch32 = outPitch / sizeof(uint32_t); @@ -1366,26 +1371,88 @@ static void DecodeDXTBlock(uint8_t *out, int outPitch, uint32_t texaddr, const u h = (((int)limited / sizeof(DXTBlock)) / (bufw / 4)) * 4; } + *fullAlphaMask = 1; // we just use one bit here. + + u32 alpha = 0xFFFFFFFF; for (int y = 0; y < h; y += 4) { u32 blockIndex = (y / 4) * (bufw / 4); int blockHeight = std::min(h - y, 4); for (int x = 0; x < minw; x += 4) { - if (n == 1) - DecodeDXT1Block(dst + outPitch32 * y + x, (const DXT1Block *)src + blockIndex, outPitch32, blockHeight, false); - if (n == 3) + switch (n) { + case 1: + DecodeDXT1Block(dst + outPitch32 * y + x, (const DXT1Block *)src + blockIndex, outPitch32, blockHeight, &alpha); + break; + case 3: DecodeDXT3Block(dst + outPitch32 * y + x, (const DXT3Block *)src + blockIndex, outPitch32, blockHeight); - if (n == 5) + break; + case 5: DecodeDXT5Block(dst + outPitch32 * y + x, (const DXT5Block *)src + blockIndex, outPitch32, blockHeight); + break; + } blockIndex++; } } + + switch (n) { + case 1: + *alphaSum = alpha; + break; + case 3: + case 5: + // Just report that we don't have full alpha, since these formats are made for that. + *alphaSum = 0; + break; + } + w = (w + 3) & ~3; if (reverseColors) { ReverseColors(out, out, GE_TFMT_8888, outPitch32 * h, useBGRA); } } -void TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureFormat format, GEPaletteFormat clutformat, uint32_t texaddr, int level, int bufw, bool reverseColors, bool useBGRA, bool expandTo32bit) { +inline u32 ClutFormatToFullAlpha(GEPaletteFormat fmt) { + switch (fmt) { + case GE_CMODE_16BIT_ABGR4444: return 0xF000; + case GE_CMODE_16BIT_ABGR5551: return 0x8000; + case GE_CMODE_16BIT_BGR5650: return 0x0000; + case GE_CMODE_32BIT_ABGR8888: return 0xFF000000; + } + return 0; +} + +inline u32 TfmtRawToFullAlpha(GETextureFormat fmt) { + switch (fmt) { + case GE_TFMT_4444: return 0xF000; + case GE_TFMT_5551: return 0x8000; + case GE_TFMT_5650: return 0x0000; + case GE_TFMT_8888: return 0xFF000000; + } + return 0; +} + +// TODO: SSE/SIMD +// At least on x86, compiler actually parallelizes these pretty well. +void CopyAndSumMask16(u16 *dst, const u16 *src, int width, u32 *outMask) { + u16 mask = 0xFFFF; + for (int i = 0; i < width; i++) { + u16 color = src[i]; + mask &= color; + dst[i] = color; + } + *outMask &= (u32)mask; +} + +void CopyAndSumMask32(u32 *dst, const u32 *src, int width, u32 *outMask) { + u32 mask = 0xFFFFFFFF; + for (int i = 0; i < width; i++) { + u32 color = src[i]; + mask &= color; + dst[i] = color; + } + *outMask &= (u32)mask; +} + +void TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureFormat format, GEPaletteFormat clutformat, uint32_t texaddr, int level, int bufw, bool reverseColors, bool useBGRA, bool expandTo32bit, u32 *alphaSum, u32 *fullAlphaMask) { bool swizzled = gstate.isTextureSwizzled(); if ((texaddr & 0x00600000) != 0 && Memory::IsVRAMAddress(texaddr)) { // This means it's in a mirror, possibly a swizzled mirror. Let's report. @@ -1424,6 +1491,7 @@ void TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureForm case GE_CMODE_16BIT_ABGR4444: { if (clutAlphaLinear_ && mipmapShareClut && !expandTo32bit) { + // We don't bother with fullalpha here // Here, reverseColors means the CLUT is already reversed. if (reverseColors) { for (int y = 0; y < h; ++y) { @@ -1439,12 +1507,14 @@ void TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureForm if (expandTo32bit && !reverseColors) { // We simply expand the CLUT to 32-bit, then we deindex as usual. Probably the fastest way. ConvertFormatToRGBA8888(clutformat, expandClut_, clut, 16); + *fullAlphaMask = 0xFF000000; for (int y = 0; y < h; ++y) { - DeIndexTexture4((u32 *)(out + outPitch * y), texptr + (bufw * y) / 2, w, expandClut_); + DeIndexTexture4((u32 *)(out + outPitch * y), texptr + (bufw * y) / 2, w, expandClut_, alphaSum); } } else { + *fullAlphaMask = ClutFormatToFullAlpha(clutformat); for (int y = 0; y < h; ++y) { - DeIndexTexture4((u16 *)(out + outPitch * y), texptr + (bufw * y) / 2, w, clut); + DeIndexTexture4((u16 *)(out + outPitch * y), texptr + (bufw * y) / 2, w, clut, alphaSum); } } } @@ -1454,8 +1524,9 @@ void TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureForm case GE_CMODE_32BIT_ABGR8888: { const u32 *clut = GetCurrentClut() + clutSharingOffset; + *fullAlphaMask = 0xFF000000; for (int y = 0; y < h; ++y) { - DeIndexTexture4((u32 *)(out + outPitch * y), texptr + (bufw * y) / 2, w, clut); + DeIndexTexture4((u32 *)(out + outPitch * y), texptr + (bufw * y) / 2, w, clut, alphaSum); } } break; @@ -1468,15 +1539,15 @@ void TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureForm break; case GE_TFMT_CLUT8: - ReadIndexedTex(out, outPitch, level, texptr, 1, bufw, expandTo32bit); + ReadIndexedTex(out, outPitch, level, texptr, 1, bufw, expandTo32bit, alphaSum, fullAlphaMask); break; case GE_TFMT_CLUT16: - ReadIndexedTex(out, outPitch, level, texptr, 2, bufw, expandTo32bit); + ReadIndexedTex(out, outPitch, level, texptr, 2, bufw, expandTo32bit, alphaSum, fullAlphaMask); break; case GE_TFMT_CLUT32: - ReadIndexedTex(out, outPitch, level, texptr, 4, bufw, expandTo32bit); + ReadIndexedTex(out, outPitch, level, texptr, 4, bufw, expandTo32bit, alphaSum, fullAlphaMask); break; case GE_TFMT_4444: @@ -1485,41 +1556,48 @@ void TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureForm if (!swizzled) { // Just a simple copy, we swizzle the color format. if (reverseColors) { + // TODO: Handle alpha mask for (int y = 0; y < h; ++y) { ReverseColors(out + outPitch * y, texptr + bufw * sizeof(u16) * y, format, w, useBGRA); } } else if (expandTo32bit) { + // TODO: Handle alpha mask for (int y = 0; y < h; ++y) { ConvertFormatToRGBA8888(format, (u32 *)(out + outPitch * y), (const u16 *)texptr + bufw * y, w); } } else { + *fullAlphaMask = TfmtRawToFullAlpha(format); for (int y = 0; y < h; ++y) { - memcpy(out + outPitch * y, texptr + bufw * sizeof(u16) * y, w * sizeof(u16)); + CopyAndSumMask16((u16 *)(out + outPitch * y), (u16 *)(texptr + bufw * sizeof(u16) * y), w, alphaSum); } } - } else if (h >= 8 && bufw <= w && !expandTo32bit) { + } /* else if (h >= 8 && bufw <= w && !expandTo32bit) { + // TODO: Handle alpha mask. This will require special versions of UnswizzleFromMem to keep the optimization. // Note: this is always safe since h must be a power of 2, so a multiple of 8. UnswizzleFromMem((u32 *)out, outPitch, texptr, bufw, h, 2); if (reverseColors) { ReverseColors(out, out, format, h * outPitch / 2, useBGRA); } - } else { + }*/ else { // We don't have enough space for all rows in out, so use a temp buffer. tmpTexBuf32_.resize(bufw * ((h + 7) & ~7)); UnswizzleFromMem(tmpTexBuf32_.data(), bufw * 2, texptr, bufw, h, 2); const u8 *unswizzled = (u8 *)tmpTexBuf32_.data(); if (reverseColors) { + // TODO: Handle alpha mask for (int y = 0; y < h; ++y) { ReverseColors(out + outPitch * y, unswizzled + bufw * sizeof(u16) * y, format, w, useBGRA); } } else if (expandTo32bit) { + // TODO: Handle alpha mask for (int y = 0; y < h; ++y) { ConvertFormatToRGBA8888(format, (u32 *)(out + outPitch * y), (const u16 *)unswizzled + bufw * y, w); } } else { + *fullAlphaMask = TfmtRawToFullAlpha(format); for (int y = 0; y < h; ++y) { - memcpy(out + outPitch * y, unswizzled + bufw * sizeof(u16) * y, w * sizeof(u16)); + CopyAndSumMask16((u16 *)(out + outPitch * y), (const u16 *)(unswizzled + bufw * sizeof(u16) * y), w, alphaSum); } } } @@ -1528,47 +1606,51 @@ void TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureForm case GE_TFMT_8888: if (!swizzled) { if (reverseColors) { + *fullAlphaMask = 0; // ignore alpha optimization for now for (int y = 0; y < h; ++y) { ReverseColors(out + outPitch * y, texptr + bufw * sizeof(u32) * y, format, w, useBGRA); } } else { + *fullAlphaMask = TfmtRawToFullAlpha(format); for (int y = 0; y < h; ++y) { - memcpy(out + outPitch * y, texptr + bufw * sizeof(u32) * y, w * sizeof(u32)); + CopyAndSumMask32((u32 *)(out + outPitch * y), (const u32 *)(texptr + bufw * sizeof(u32) * y), w * sizeof(u32), alphaSum); } } - } else if (h >= 8 && bufw <= w) { + } /* else if (h >= 8 && bufw <= w) { + // TODO: Handle alpha mask UnswizzleFromMem((u32 *)out, outPitch, texptr, bufw, h, 4); if (reverseColors) { ReverseColors(out, out, format, h * outPitch / 4, useBGRA); } - } else { - // We don't have enough space for all rows in out, so use a temp buffer. + }*/ else { tmpTexBuf32_.resize(bufw * ((h + 7) & ~7)); UnswizzleFromMem(tmpTexBuf32_.data(), bufw * 4, texptr, bufw, h, 4); const u8 *unswizzled = (u8 *)tmpTexBuf32_.data(); if (reverseColors) { + // TODO: Handle alpha mask for (int y = 0; y < h; ++y) { ReverseColors(out + outPitch * y, unswizzled + bufw * sizeof(u32) * y, format, w, useBGRA); } } else { + *fullAlphaMask = TfmtRawToFullAlpha(format); for (int y = 0; y < h; ++y) { - memcpy(out + outPitch * y, unswizzled + bufw * sizeof(u32) * y, w * sizeof(u32)); + CopyAndSumMask32((u32 *)(out + outPitch * y), (const u32 *)(unswizzled + bufw * sizeof(u32) * y), w * sizeof(u32), alphaSum); } } } break; case GE_TFMT_DXT1: - DecodeDXTBlock(out, outPitch, texaddr, texptr, w, h, bufw, reverseColors, useBGRA); + DecodeDXTBlocks(out, outPitch, texaddr, texptr, w, h, bufw, reverseColors, useBGRA, alphaSum, fullAlphaMask); break; case GE_TFMT_DXT3: - DecodeDXTBlock(out, outPitch, texaddr, texptr, w, h, bufw, reverseColors, useBGRA); + DecodeDXTBlocks(out, outPitch, texaddr, texptr, w, h, bufw, reverseColors, useBGRA, alphaSum, fullAlphaMask); break; case GE_TFMT_DXT5: - DecodeDXTBlock(out, outPitch, texaddr, texptr, w, h, bufw, reverseColors, useBGRA); + DecodeDXTBlocks(out, outPitch, texaddr, texptr, w, h, bufw, reverseColors, useBGRA, alphaSum, fullAlphaMask); break; default: @@ -1577,7 +1659,7 @@ void TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureForm } } -void TextureCacheCommon::ReadIndexedTex(u8 *out, int outPitch, int level, const u8 *texptr, int bytesPerIndex, int bufw, bool expandTo32Bit) { +void TextureCacheCommon::ReadIndexedTex(u8 *out, int outPitch, int level, const u8 *texptr, int bytesPerIndex, int bufw, bool expandTo32Bit, u32 *alphaSum, u32 *fullAlphaMask) { int w = gstate.getTextureWidth(level); int h = gstate.getTextureHeight(level); @@ -1596,6 +1678,7 @@ void TextureCacheCommon::ReadIndexedTex(u8 *out, int outPitch, int level, const ConvertFormatToRGBA8888(GEPaletteFormat(palFormat), expandClut_, clut16, 256); clut32 = expandClut_; palFormat = GE_CMODE_32BIT_ABGR8888; + *fullAlphaMask = 0xFF000000; } switch (palFormat) { @@ -1606,19 +1689,19 @@ void TextureCacheCommon::ReadIndexedTex(u8 *out, int outPitch, int level, const switch (bytesPerIndex) { case 1: for (int y = 0; y < h; ++y) { - DeIndexTexture((u16 *)(out + outPitch * y), (const u8 *)texptr + bufw * y, w, clut16); + DeIndexTexture((u16 *)(out + outPitch * y), (const u8 *)texptr + bufw * y, w, clut16, alphaSum); } break; case 2: for (int y = 0; y < h; ++y) { - DeIndexTexture((u16 *)(out + outPitch * y), (const u16_le *)texptr + bufw * y, w, clut16); + DeIndexTexture((u16 *)(out + outPitch * y), (const u16_le *)texptr + bufw * y, w, clut16, alphaSum); } break; case 4: for (int y = 0; y < h; ++y) { - DeIndexTexture((u16 *)(out + outPitch * y), (const u32_le *)texptr + bufw * y, w, clut16); + DeIndexTexture((u16 *)(out + outPitch * y), (const u32_le *)texptr + bufw * y, w, clut16, alphaSum); } break; } @@ -1630,19 +1713,19 @@ void TextureCacheCommon::ReadIndexedTex(u8 *out, int outPitch, int level, const switch (bytesPerIndex) { case 1: for (int y = 0; y < h; ++y) { - DeIndexTexture((u32 *)(out + outPitch * y), (const u8 *)texptr + bufw * y, w, clut32); + DeIndexTexture((u32 *)(out + outPitch * y), (const u8 *)texptr + bufw * y, w, clut32, alphaSum); } break; case 2: for (int y = 0; y < h; ++y) { - DeIndexTexture((u32 *)(out + outPitch * y), (const u16_le *)texptr + bufw * y, w, clut32); + DeIndexTexture((u32 *)(out + outPitch * y), (const u16_le *)texptr + bufw * y, w, clut32, alphaSum); } break; case 4: for (int y = 0; y < h; ++y) { - DeIndexTexture((u32 *)(out + outPitch * y), (const u32_le *)texptr + bufw * y, w, clut32); + DeIndexTexture((u32 *)(out + outPitch * y), (const u32_le *)texptr + bufw * y, w, clut32, alphaSum); } break; } diff --git a/GPU/Common/TextureCacheCommon.h b/GPU/Common/TextureCacheCommon.h index efaa1cbef5a6..caaea2fcf97f 100644 --- a/GPU/Common/TextureCacheCommon.h +++ b/GPU/Common/TextureCacheCommon.h @@ -275,9 +275,9 @@ class TextureCacheCommon { virtual void UpdateCurrentClut(GEPaletteFormat clutFormat, u32 clutBase, bool clutIndexIsSimple) = 0; bool CheckFullHash(TexCacheEntry *entry, bool &doDelete); - void DecodeTextureLevel(u8 *out, int outPitch, GETextureFormat format, GEPaletteFormat clutformat, uint32_t texaddr, int level, int bufw, bool reverseColors, bool useBGRA, bool expandTo32Bit); + void DecodeTextureLevel(u8 *out, int outPitch, GETextureFormat format, GEPaletteFormat clutformat, uint32_t texaddr, int level, int bufw, bool reverseColors, bool useBGRA, bool expandTo32Bit, u32 *alphaSum, u32 *fullAlphaMask); void UnswizzleFromMem(u32 *dest, u32 destPitch, const u8 *texptr, u32 bufw, u32 height, u32 bytesPerPixel); - void ReadIndexedTex(u8 *out, int outPitch, int level, const u8 *texptr, int bytesPerIndex, int bufw, bool expandTo32Bit); + void ReadIndexedTex(u8 *out, int outPitch, int level, const u8 *texptr, int bytesPerIndex, int bufw, bool expandTo32Bit, u32 *alphaSum, u32 *fullAlphaMask); ReplacedTexture &FindReplacement(TexCacheEntry *entry, int &w, int &h); template diff --git a/GPU/Common/TextureDecoder.cpp b/GPU/Common/TextureDecoder.cpp index a00b5ee529a7..08c0068702ba 100644 --- a/GPU/Common/TextureDecoder.cpp +++ b/GPU/Common/TextureDecoder.cpp @@ -432,9 +432,13 @@ class DXTDecoder { inline void WriteColorsDXT3(u32 *dst, const DXT3Block *src, int pitch, int height); inline void WriteColorsDXT5(u32 *dst, const DXT5Block *src, int pitch, int height); + bool AnyNonFullAlpha() const { return anyNonFullAlpha_; } + protected: u32 colors_[4]; u8 alpha_[8]; + bool alphaMode_ = false; + bool anyNonFullAlpha_ = false; }; static inline u32 makecol(int r, int g, int b, int a) { @@ -471,6 +475,9 @@ void DXTDecoder::DecodeColors(const DXT1Block *src, bool ignore1bitAlpha) { int blue3 = (blue1 + blue2) / 2; colors_[2] = makecol(red3, green3, blue3, alpha); colors_[3] = makecol(0, 0, 0, 0); + if (alpha == 255) { + alphaMode_ = true; + } } } @@ -508,14 +515,23 @@ void DXTDecoder::DecodeAlphaDXT5(const DXT5Block *src) { } void DXTDecoder::WriteColorsDXT1(u32 *dst, const DXT1Block *src, int pitch, int height) { + bool anyColor3 = false; for (int y = 0; y < height; y++) { int colordata = src->lines[y]; for (int x = 0; x < 4; x++) { - dst[x] = colors_[colordata & 3]; + int col = colordata & 3; + if (col == 3) { + anyColor3 = true; + } + dst[x] = colors_[col]; colordata >>= 2; } dst += pitch; } + + if (alphaMode_ && anyColor3) { + anyNonFullAlpha_ = true; + } } void DXTDecoder::WriteColorsDXT3(u32 *dst, const DXT3Block *src, int pitch, int height) { @@ -610,10 +626,11 @@ uint32_t GetDXT5Texel(const DXT5Block *src, int x, int y) { } // This could probably be done faster by decoding two or four blocks at a time with SSE/NEON. -void DecodeDXT1Block(u32 *dst, const DXT1Block *src, int pitch, int height, bool ignore1bitAlpha) { +void DecodeDXT1Block(u32 *dst, const DXT1Block *src, int pitch, int height, u32 *alpha) { DXTDecoder dxt; - dxt.DecodeColors(src, ignore1bitAlpha); + dxt.DecodeColors(src, false); dxt.WriteColorsDXT1(dst, src, pitch, height); + *alpha = dxt.AnyNonFullAlpha() ? 0 : 1; } void DecodeDXT3Block(u32 *dst, const DXT3Block *src, int pitch, int height) { diff --git a/GPU/Common/TextureDecoder.h b/GPU/Common/TextureDecoder.h index 6a0e59fea3a0..f85adbec55f3 100644 --- a/GPU/Common/TextureDecoder.h +++ b/GPU/Common/TextureDecoder.h @@ -65,7 +65,7 @@ struct DXT5Block { u8 alpha1; u8 alpha2; }; -void DecodeDXT1Block(u32 *dst, const DXT1Block *src, int pitch, int height, bool ignore1bitAlpha); +void DecodeDXT1Block(u32 *dst, const DXT1Block *src, int pitch, int height, u32 *alpha); void DecodeDXT3Block(u32 *dst, const DXT3Block *src, int pitch, int height); void DecodeDXT5Block(u32 *dst, const DXT5Block *src, int pitch, int height); @@ -94,15 +94,23 @@ static const u8 textureBitsPerPixel[16] = { u32 GetTextureBufw(int level, u32 texaddr, GETextureFormat format); +inline bool AlphaSumIsFull(u32 alphaSum, u32 fullAlphaMask) { + return fullAlphaMask != 0 && (alphaSum & fullAlphaMask) == fullAlphaMask; +} + template -inline void DeIndexTexture(ClutT *dest, const IndexT *indexed, int length, const ClutT *clut) { +inline void DeIndexTexture(/*WRITEONLY*/ ClutT *dest, const IndexT *indexed, int length, const ClutT *clut, u32 *outAlphaSum) { // Usually, there is no special offset, mask, or shift. const bool nakedIndex = gstate.isClutIndexSimple(); + ClutT alphaSum = (ClutT)(-1); + if (nakedIndex) { if (sizeof(IndexT) == 1) { for (int i = 0; i < length; ++i) { - *dest++ = clut[*indexed++]; + ClutT color = clut[*indexed++]; + *dest++ = color; + } } else { for (int i = 0; i < length; ++i) { @@ -117,29 +125,38 @@ inline void DeIndexTexture(ClutT *dest, const IndexT *indexed, int length, const } template -inline void DeIndexTexture(ClutT *dest, const u32 texaddr, int length, const ClutT *clut) { +inline void DeIndexTexture(/*WRITEONLY*/ ClutT *dest, const u32 texaddr, int length, const ClutT *clut, u32 *outAlphaSum) { const IndexT *indexed = (const IndexT *) Memory::GetPointer(texaddr); - DeIndexTexture(dest, indexed, length, clut); + DeIndexTexture(dest, indexed, length, clut, outAlphaSum); } template -inline void DeIndexTexture4(ClutT *dest, const u8 *indexed, int length, const ClutT *clut) { +inline void DeIndexTexture4(/*WRITEONLY*/ ClutT *dest, const u8 *indexed, int length, const ClutT *clut, u32 *outAlphaSum) { // Usually, there is no special offset, mask, or shift. const bool nakedIndex = gstate.isClutIndexSimple(); + ClutT alphaSum = (ClutT)(-1); if (nakedIndex) { for (int i = 0; i < length; i += 2) { u8 index = *indexed++; - dest[i + 0] = clut[(index >> 0) & 0xf]; - dest[i + 1] = clut[(index >> 4) & 0xf]; + ClutT color0 = clut[index & 0xf]; + ClutT color1 = clut[index >> 4]; + dest[i + 0] = color0; + dest[i + 1] = color1; + alphaSum &= color0 & color1; } } else { for (int i = 0; i < length; i += 2) { u8 index = *indexed++; - dest[i + 0] = clut[gstate.transformClutIndex((index >> 0) & 0xf)]; - dest[i + 1] = clut[gstate.transformClutIndex((index >> 4) & 0xf)]; + ClutT color0 = clut[gstate.transformClutIndex((index >> 0) & 0xf)]; + ClutT color1 = clut[gstate.transformClutIndex((index >> 4) & 0xf)]; + dest[i + 0] = color0; + dest[i + 1] = color1; + alphaSum &= color0 & color1; } } + + *outAlphaSum &= (u32)alphaSum; } template diff --git a/GPU/D3D11/TextureCacheD3D11.cpp b/GPU/D3D11/TextureCacheD3D11.cpp index 722eae29b4e7..c34bb968719d 100644 --- a/GPU/D3D11/TextureCacheD3D11.cpp +++ b/GPU/D3D11/TextureCacheD3D11.cpp @@ -708,16 +708,18 @@ void TextureCacheD3D11::LoadTextureLevel(TexCacheEntry &entry, ReplacedTexture & } bool expand32 = !gstate_c.Supports(GPU_SUPPORTS_16BIT_FORMATS); - DecodeTextureLevel((u8 *)pixelData, decPitch, tfmt, clutformat, texaddr, level, bufw, false, false, expand32); + u32 fullAlphaMask = 0; + u32 alphaSum = 0xFFFFFFFF; + DecodeTextureLevel((u8 *)pixelData, decPitch, tfmt, clutformat, texaddr, level, bufw, false, false, expand32, &alphaSum, &fullAlphaMask); // We check before scaling since scaling shouldn't invent alpha from a full alpha texture. - if ((entry.status & TexCacheEntry::STATUS_CHANGE_FREQUENT) == 0) { - TexCacheEntry::TexStatus alphaStatus = CheckAlpha(pixelData, dstFmt, decPitch / bpp, w, h); - entry.SetAlphaStatus(alphaStatus, level); + if (AlphaSumIsFull(alphaSum, fullAlphaMask)) { + entry.SetAlphaStatus(TexCacheEntry::STATUS_ALPHA_FULL, level); } else { - entry.SetAlphaStatus(TexCacheEntry::STATUS_ALPHA_UNKNOWN); + entry.SetAlphaStatus(TexCacheEntry::STATUS_ALPHA_UNKNOWN, level); } + if (scaleFactor > 1) { u32 scaleFmt = (u32)dstFmt; scaler.ScaleAlways((u32 *)mapData, pixelData, scaleFmt, w, h, scaleFactor); diff --git a/GPU/Directx9/TextureCacheDX9.cpp b/GPU/Directx9/TextureCacheDX9.cpp index 23334baf8bb5..37d33921025a 100644 --- a/GPU/Directx9/TextureCacheDX9.cpp +++ b/GPU/Directx9/TextureCacheDX9.cpp @@ -634,14 +634,15 @@ void TextureCacheDX9::LoadTextureLevel(TexCacheEntry &entry, ReplacedTexture &re decPitch = w * bpp; } - DecodeTextureLevel((u8 *)pixelData, decPitch, tfmt, clutformat, texaddr, level, bufw, false, false, false); + u32 fullAlphaMask = 0; + u32 alphaSum = 0xFFFFFFFF; + DecodeTextureLevel((u8 *)pixelData, decPitch, tfmt, clutformat, texaddr, level, bufw, false, false, false, &alphaSum, &fullAlphaMask); // We check before scaling since scaling shouldn't invent alpha from a full alpha texture. - if ((entry.status & TexCacheEntry::STATUS_CHANGE_FREQUENT) == 0) { - TexCacheEntry::TexStatus alphaStatus = CheckAlpha(pixelData, dstFmt, decPitch / bpp, w, h); - entry.SetAlphaStatus(alphaStatus, level); + if (AlphaSumIsFull(alphaSum, fullAlphaMask)) { + entry.SetAlphaStatus(TexCacheEntry::STATUS_ALPHA_FULL, level); } else { - entry.SetAlphaStatus(TexCacheEntry::STATUS_ALPHA_UNKNOWN); + entry.SetAlphaStatus(TexCacheEntry::STATUS_ALPHA_UNKNOWN, level); } if (scaleFactor > 1) { diff --git a/GPU/GLES/TextureCacheGLES.cpp b/GPU/GLES/TextureCacheGLES.cpp index df9d3f88b9c9..0e53525689f9 100644 --- a/GPU/GLES/TextureCacheGLES.cpp +++ b/GPU/GLES/TextureCacheGLES.cpp @@ -675,14 +675,15 @@ void TextureCacheGLES::LoadTextureLevel(TexCacheEntry &entry, ReplacedTexture &r decPitch = std::max(w * pixelSize, 4); pixelData = (uint8_t *)AllocateAlignedMemory(decPitch * h * pixelSize, 16); - DecodeTextureLevel(pixelData, decPitch, GETextureFormat(entry.format), clutformat, texaddr, level, bufw, true, false, false); + u32 fullAlphaMask = 0; + u32 alphaSum = 0xFFFFFFFF; + DecodeTextureLevel(pixelData, decPitch, GETextureFormat(entry.format), clutformat, texaddr, level, bufw, true, false, false, &alphaSum, &fullAlphaMask); // We check before scaling since scaling shouldn't invent alpha from a full alpha texture. - if ((entry.status & TexCacheEntry::STATUS_CHANGE_FREQUENT) == 0) { - TexCacheEntry::TexStatus alphaStatus = CheckAlpha(pixelData, dstFmt, decPitch / pixelSize, w, h); - entry.SetAlphaStatus(alphaStatus, level); + if (AlphaSumIsFull(alphaSum, fullAlphaMask)) { + entry.SetAlphaStatus(TexCacheEntry::STATUS_ALPHA_FULL, level); } else { - entry.SetAlphaStatus(TexCacheEntry::STATUS_ALPHA_UNKNOWN); + entry.SetAlphaStatus(TexCacheEntry::STATUS_ALPHA_UNKNOWN, level); } if (scaleFactor > 1) { diff --git a/GPU/Vulkan/TextureCacheVulkan.cpp b/GPU/Vulkan/TextureCacheVulkan.cpp index afb8a5ba4044..cc0c95499c1a 100644 --- a/GPU/Vulkan/TextureCacheVulkan.cpp +++ b/GPU/Vulkan/TextureCacheVulkan.cpp @@ -980,17 +980,17 @@ void TextureCacheVulkan::LoadTextureLevel(TexCacheEntry &entry, uint8_t *writePt } bool expand32 = !gstate_c.Supports(GPU_SUPPORTS_16BIT_FORMATS) || dstFmt == VK_FORMAT_R8G8B8A8_UNORM; - DecodeTextureLevel((u8 *)pixelData, decPitch, tfmt, clutformat, texaddr, level, bufw, false, false, expand32); + + u32 alphaSum = 0xFFFFFFFF; + u32 fullAlphaMask = 0x0; + + DecodeTextureLevel((u8 *)pixelData, decPitch, tfmt, clutformat, texaddr, level, bufw, false, false, expand32, &alphaSum, &fullAlphaMask); gpuStats.numTexturesDecoded++; - // We check before scaling since scaling shouldn't invent alpha from a full alpha texture. - if ((entry.status & TexCacheEntry::STATUS_CHANGE_FREQUENT) == 0) { - // TODO: When we decode directly, this can be more expensive (maybe not on mobile?) - // This does allow us to skip alpha testing, though. - TexCacheEntry::TexStatus alphaStatus = CheckAlpha(pixelData, dstFmt, decPitch / bpp, w, h); - entry.SetAlphaStatus(alphaStatus, level); + if (AlphaSumIsFull(alphaSum, fullAlphaMask)) { + entry.SetAlphaStatus(TexCacheEntry::STATUS_ALPHA_FULL, level); } else { - entry.SetAlphaStatus(TexCacheEntry::STATUS_ALPHA_UNKNOWN); + entry.SetAlphaStatus(TexCacheEntry::STATUS_ALPHA_UNKNOWN, level); } if (scaleFactor > 1) { From e6df3ab23a4e460354272bc17b6251df41f8f927 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Mon, 11 Apr 2022 23:39:20 +0200 Subject: [PATCH 03/13] Comments --- GPU/Common/TextureCacheCommon.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp index 84bbed70be30..dd15638db8f6 100644 --- a/GPU/Common/TextureCacheCommon.cpp +++ b/GPU/Common/TextureCacheCommon.cpp @@ -1431,7 +1431,7 @@ inline u32 TfmtRawToFullAlpha(GETextureFormat fmt) { } // TODO: SSE/SIMD -// At least on x86, compiler actually parallelizes these pretty well. +// At least on x86, compiler actually SIMDs these pretty well. void CopyAndSumMask16(u16 *dst, const u16 *src, int width, u32 *outMask) { u16 mask = 0xFFFF; for (int i = 0; i < width; i++) { @@ -1491,7 +1491,7 @@ void TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureForm case GE_CMODE_16BIT_ABGR4444: { if (clutAlphaLinear_ && mipmapShareClut && !expandTo32bit) { - // We don't bother with fullalpha here + // We don't bother with fullalpha here (clutAlphaLinear_) // Here, reverseColors means the CLUT is already reversed. if (reverseColors) { for (int y = 0; y < h; ++y) { From 35e0bfeaccccb59dfdfba80369928d03cbcdb12a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Tue, 12 Apr 2022 09:23:36 +0200 Subject: [PATCH 04/13] Fix DeIndexTexture --- GPU/Common/TextureCacheCommon.cpp | 2 +- GPU/Common/TextureDecoder.h | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp index dd15638db8f6..a8ea77346c7f 100644 --- a/GPU/Common/TextureCacheCommon.cpp +++ b/GPU/Common/TextureCacheCommon.cpp @@ -1313,7 +1313,7 @@ static void ReverseColors(void *dstBuf, const void *srcBuf, GETextureFormat fmt, case GE_TFMT_4444: ConvertRGBA4444ToABGR4444((u16 *)dstBuf, (const u16 *)srcBuf, numPixels); break; - // Final Fantasy 2 uses this heavily in animated textures. + // Final Fantasy 2 uses this heavily in animated textures. case GE_TFMT_5551: ConvertRGBA5551ToABGR1555((u16 *)dstBuf, (const u16 *)srcBuf, numPixels); break; diff --git a/GPU/Common/TextureDecoder.h b/GPU/Common/TextureDecoder.h index f85adbec55f3..51f879d7213b 100644 --- a/GPU/Common/TextureDecoder.h +++ b/GPU/Common/TextureDecoder.h @@ -109,17 +109,21 @@ inline void DeIndexTexture(/*WRITEONLY*/ ClutT *dest, const IndexT *indexed, int if (sizeof(IndexT) == 1) { for (int i = 0; i < length; ++i) { ClutT color = clut[*indexed++]; + alphaSum &= color; *dest++ = color; - } } else { for (int i = 0; i < length; ++i) { - *dest++ = clut[(*indexed++) & 0xFF]; + ClutT color = (*indexed++) & 0xFF; + alphaSum &= color; + *dest++ = color; } } } else { for (int i = 0; i < length; ++i) { - *dest++ = clut[gstate.transformClutIndex(*indexed++)]; + ClutT color = clut[gstate.transformClutIndex(*indexed++)]; + alphaSum &= color; + *dest++ = color; } } } From 613df29467d164535bc2624111ede48507ed05ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Wed, 13 Apr 2022 09:54:48 +0200 Subject: [PATCH 05/13] Remove redundant line --- GPU/Common/TextureCacheCommon.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp index a8ea77346c7f..de2775aad9ff 100644 --- a/GPU/Common/TextureCacheCommon.cpp +++ b/GPU/Common/TextureCacheCommon.cpp @@ -1404,7 +1404,6 @@ static void DecodeDXTBlocks(uint8_t *out, int outPitch, uint32_t texaddr, const break; } - w = (w + 3) & ~3; if (reverseColors) { ReverseColors(out, out, GE_TFMT_8888, outPitch32 * h, useBGRA); } From 42cd937de2ceda3f3b4998303a584ebcd7dea6dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Wed, 13 Apr 2022 23:44:03 +0200 Subject: [PATCH 06/13] Simplification and some cleanup --- GPU/Common/TextureCacheCommon.cpp | 52 +++++++++++++++++-------------- GPU/Common/TextureCacheCommon.h | 9 +++++- GPU/D3D11/TextureCacheD3D11.cpp | 12 ++----- GPU/Directx9/TextureCacheDX9.cpp | 12 ++----- GPU/GLES/TextureCacheGLES.cpp | 11 ++----- GPU/Vulkan/TextureCacheVulkan.cpp | 17 +++------- 6 files changed, 48 insertions(+), 65 deletions(-) diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp index de2775aad9ff..49bea7c2f637 100644 --- a/GPU/Common/TextureCacheCommon.cpp +++ b/GPU/Common/TextureCacheCommon.cpp @@ -1451,13 +1451,17 @@ void CopyAndSumMask32(u32 *dst, const u32 *src, int width, u32 *outMask) { *outMask &= (u32)mask; } -void TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureFormat format, GEPaletteFormat clutformat, uint32_t texaddr, int level, int bufw, bool reverseColors, bool useBGRA, bool expandTo32bit, u32 *alphaSum, u32 *fullAlphaMask) { +CheckAlphaResult TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureFormat format, GEPaletteFormat clutformat, uint32_t texaddr, int level, int bufw, bool reverseColors, bool useBGRA, bool expandTo32bit) { + u32 alphaSum = 0xFFFFFFFF; + u32 fullAlphaMask = 0x0; + bool swizzled = gstate.isTextureSwizzled(); if ((texaddr & 0x00600000) != 0 && Memory::IsVRAMAddress(texaddr)) { // This means it's in a mirror, possibly a swizzled mirror. Let's report. WARN_LOG_REPORT_ONCE(texmirror, G3D, "Decoding texture from VRAM mirror at %08x swizzle=%d", texaddr, swizzled ? 1 : 0); if ((texaddr & 0x00200000) == 0x00200000) { // Technically 2 and 6 are slightly different, but this is better than nothing probably. + // We should only see this with depth textures anyway which we don't support uploading (yet). swizzled = !swizzled; } // Note that (texaddr & 0x00600000) == 0x00600000 is very likely to be depth texturing. @@ -1506,14 +1510,14 @@ void TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureForm if (expandTo32bit && !reverseColors) { // We simply expand the CLUT to 32-bit, then we deindex as usual. Probably the fastest way. ConvertFormatToRGBA8888(clutformat, expandClut_, clut, 16); - *fullAlphaMask = 0xFF000000; + fullAlphaMask = 0xFF000000; for (int y = 0; y < h; ++y) { - DeIndexTexture4((u32 *)(out + outPitch * y), texptr + (bufw * y) / 2, w, expandClut_, alphaSum); + DeIndexTexture4((u32 *)(out + outPitch * y), texptr + (bufw * y) / 2, w, expandClut_, &alphaSum); } } else { - *fullAlphaMask = ClutFormatToFullAlpha(clutformat); + fullAlphaMask = ClutFormatToFullAlpha(clutformat); for (int y = 0; y < h; ++y) { - DeIndexTexture4((u16 *)(out + outPitch * y), texptr + (bufw * y) / 2, w, clut, alphaSum); + DeIndexTexture4((u16 *)(out + outPitch * y), texptr + (bufw * y) / 2, w, clut, &alphaSum); } } } @@ -1523,30 +1527,30 @@ void TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureForm case GE_CMODE_32BIT_ABGR8888: { const u32 *clut = GetCurrentClut() + clutSharingOffset; - *fullAlphaMask = 0xFF000000; + fullAlphaMask = 0xFF000000; for (int y = 0; y < h; ++y) { - DeIndexTexture4((u32 *)(out + outPitch * y), texptr + (bufw * y) / 2, w, clut, alphaSum); + DeIndexTexture4((u32 *)(out + outPitch * y), texptr + (bufw * y) / 2, w, clut, &alphaSum); } } break; default: ERROR_LOG_REPORT(G3D, "Unknown CLUT4 texture mode %d", gstate.getClutPaletteFormat()); - return; + return CHECKALPHA_ANY; } } break; case GE_TFMT_CLUT8: - ReadIndexedTex(out, outPitch, level, texptr, 1, bufw, expandTo32bit, alphaSum, fullAlphaMask); + ReadIndexedTex(out, outPitch, level, texptr, 1, bufw, expandTo32bit, &alphaSum, &fullAlphaMask); break; case GE_TFMT_CLUT16: - ReadIndexedTex(out, outPitch, level, texptr, 2, bufw, expandTo32bit, alphaSum, fullAlphaMask); + ReadIndexedTex(out, outPitch, level, texptr, 2, bufw, expandTo32bit, &alphaSum, &fullAlphaMask); break; case GE_TFMT_CLUT32: - ReadIndexedTex(out, outPitch, level, texptr, 4, bufw, expandTo32bit, alphaSum, fullAlphaMask); + ReadIndexedTex(out, outPitch, level, texptr, 4, bufw, expandTo32bit, &alphaSum, &fullAlphaMask); break; case GE_TFMT_4444: @@ -1565,9 +1569,9 @@ void TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureForm ConvertFormatToRGBA8888(format, (u32 *)(out + outPitch * y), (const u16 *)texptr + bufw * y, w); } } else { - *fullAlphaMask = TfmtRawToFullAlpha(format); + fullAlphaMask = TfmtRawToFullAlpha(format); for (int y = 0; y < h; ++y) { - CopyAndSumMask16((u16 *)(out + outPitch * y), (u16 *)(texptr + bufw * sizeof(u16) * y), w, alphaSum); + CopyAndSumMask16((u16 *)(out + outPitch * y), (u16 *)(texptr + bufw * sizeof(u16) * y), w, &alphaSum); } } } /* else if (h >= 8 && bufw <= w && !expandTo32bit) { @@ -1594,9 +1598,9 @@ void TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureForm ConvertFormatToRGBA8888(format, (u32 *)(out + outPitch * y), (const u16 *)unswizzled + bufw * y, w); } } else { - *fullAlphaMask = TfmtRawToFullAlpha(format); + fullAlphaMask = TfmtRawToFullAlpha(format); for (int y = 0; y < h; ++y) { - CopyAndSumMask16((u16 *)(out + outPitch * y), (const u16 *)(unswizzled + bufw * sizeof(u16) * y), w, alphaSum); + CopyAndSumMask16((u16 *)(out + outPitch * y), (const u16 *)(unswizzled + bufw * sizeof(u16) * y), w, &alphaSum); } } } @@ -1605,14 +1609,14 @@ void TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureForm case GE_TFMT_8888: if (!swizzled) { if (reverseColors) { - *fullAlphaMask = 0; // ignore alpha optimization for now + fullAlphaMask = 0; // ignore alpha optimization for now for (int y = 0; y < h; ++y) { ReverseColors(out + outPitch * y, texptr + bufw * sizeof(u32) * y, format, w, useBGRA); } } else { - *fullAlphaMask = TfmtRawToFullAlpha(format); + fullAlphaMask = TfmtRawToFullAlpha(format); for (int y = 0; y < h; ++y) { - CopyAndSumMask32((u32 *)(out + outPitch * y), (const u32 *)(texptr + bufw * sizeof(u32) * y), w * sizeof(u32), alphaSum); + CopyAndSumMask32((u32 *)(out + outPitch * y), (const u32 *)(texptr + bufw * sizeof(u32) * y), w * sizeof(u32), &alphaSum); } } } /* else if (h >= 8 && bufw <= w) { @@ -1632,30 +1636,32 @@ void TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureForm ReverseColors(out + outPitch * y, unswizzled + bufw * sizeof(u32) * y, format, w, useBGRA); } } else { - *fullAlphaMask = TfmtRawToFullAlpha(format); + fullAlphaMask = TfmtRawToFullAlpha(format); for (int y = 0; y < h; ++y) { - CopyAndSumMask32((u32 *)(out + outPitch * y), (const u32 *)(unswizzled + bufw * sizeof(u32) * y), w * sizeof(u32), alphaSum); + CopyAndSumMask32((u32 *)(out + outPitch * y), (const u32 *)(unswizzled + bufw * sizeof(u32) * y), w * sizeof(u32), &alphaSum); } } } break; case GE_TFMT_DXT1: - DecodeDXTBlocks(out, outPitch, texaddr, texptr, w, h, bufw, reverseColors, useBGRA, alphaSum, fullAlphaMask); + DecodeDXTBlocks(out, outPitch, texaddr, texptr, w, h, bufw, reverseColors, useBGRA, &alphaSum, &fullAlphaMask); break; case GE_TFMT_DXT3: - DecodeDXTBlocks(out, outPitch, texaddr, texptr, w, h, bufw, reverseColors, useBGRA, alphaSum, fullAlphaMask); + DecodeDXTBlocks(out, outPitch, texaddr, texptr, w, h, bufw, reverseColors, useBGRA, &alphaSum, &fullAlphaMask); break; case GE_TFMT_DXT5: - DecodeDXTBlocks(out, outPitch, texaddr, texptr, w, h, bufw, reverseColors, useBGRA, alphaSum, fullAlphaMask); + DecodeDXTBlocks(out, outPitch, texaddr, texptr, w, h, bufw, reverseColors, useBGRA, &alphaSum, &fullAlphaMask); break; default: ERROR_LOG_REPORT(G3D, "Unknown Texture Format %d!!!", format); break; } + + return AlphaSumIsFull(alphaSum, fullAlphaMask) ? CHECKALPHA_FULL : CHECKALPHA_ANY; } void TextureCacheCommon::ReadIndexedTex(u8 *out, int outPitch, int level, const u8 *texptr, int bytesPerIndex, int bufw, bool expandTo32Bit, u32 *alphaSum, u32 *fullAlphaMask) { diff --git a/GPU/Common/TextureCacheCommon.h b/GPU/Common/TextureCacheCommon.h index caaea2fcf97f..debefa1b3a60 100644 --- a/GPU/Common/TextureCacheCommon.h +++ b/GPU/Common/TextureCacheCommon.h @@ -181,6 +181,13 @@ struct TexCacheEntry { SetAlphaStatus(newStatus); } } + void SetAlphaStatus(CheckAlphaResult alphaResult, int level) { + TexStatus newStatus = (TexStatus)alphaResult; + // For non-level zero, only set more restrictive. + if (newStatus == STATUS_ALPHA_UNKNOWN || level == 0) { + SetAlphaStatus(newStatus); + } + } bool Matches(u16 dim2, u8 format2, u8 maxLevel2) const; u64 CacheKey() const; @@ -275,7 +282,7 @@ class TextureCacheCommon { virtual void UpdateCurrentClut(GEPaletteFormat clutFormat, u32 clutBase, bool clutIndexIsSimple) = 0; bool CheckFullHash(TexCacheEntry *entry, bool &doDelete); - void DecodeTextureLevel(u8 *out, int outPitch, GETextureFormat format, GEPaletteFormat clutformat, uint32_t texaddr, int level, int bufw, bool reverseColors, bool useBGRA, bool expandTo32Bit, u32 *alphaSum, u32 *fullAlphaMask); + CheckAlphaResult DecodeTextureLevel(u8 *out, int outPitch, GETextureFormat format, GEPaletteFormat clutformat, uint32_t texaddr, int level, int bufw, bool reverseColors, bool useBGRA, bool expandTo32Bit); void UnswizzleFromMem(u32 *dest, u32 destPitch, const u8 *texptr, u32 bufw, u32 height, u32 bytesPerPixel); void ReadIndexedTex(u8 *out, int outPitch, int level, const u8 *texptr, int bytesPerIndex, int bufw, bool expandTo32Bit, u32 *alphaSum, u32 *fullAlphaMask); ReplacedTexture &FindReplacement(TexCacheEntry *entry, int &w, int &h); diff --git a/GPU/D3D11/TextureCacheD3D11.cpp b/GPU/D3D11/TextureCacheD3D11.cpp index c34bb968719d..ee428c0ab679 100644 --- a/GPU/D3D11/TextureCacheD3D11.cpp +++ b/GPU/D3D11/TextureCacheD3D11.cpp @@ -708,17 +708,9 @@ void TextureCacheD3D11::LoadTextureLevel(TexCacheEntry &entry, ReplacedTexture & } bool expand32 = !gstate_c.Supports(GPU_SUPPORTS_16BIT_FORMATS); - u32 fullAlphaMask = 0; - u32 alphaSum = 0xFFFFFFFF; - DecodeTextureLevel((u8 *)pixelData, decPitch, tfmt, clutformat, texaddr, level, bufw, false, false, expand32, &alphaSum, &fullAlphaMask); - - // We check before scaling since scaling shouldn't invent alpha from a full alpha texture. - if (AlphaSumIsFull(alphaSum, fullAlphaMask)) { - entry.SetAlphaStatus(TexCacheEntry::STATUS_ALPHA_FULL, level); - } else { - entry.SetAlphaStatus(TexCacheEntry::STATUS_ALPHA_UNKNOWN, level); - } + CheckAlphaResult alphaResult = DecodeTextureLevel((u8 *)pixelData, decPitch, tfmt, clutformat, texaddr, level, bufw, false, false, expand32); + entry.SetAlphaStatus(alphaResult, level); if (scaleFactor > 1) { u32 scaleFmt = (u32)dstFmt; diff --git a/GPU/Directx9/TextureCacheDX9.cpp b/GPU/Directx9/TextureCacheDX9.cpp index 37d33921025a..fb2efaa8e8bc 100644 --- a/GPU/Directx9/TextureCacheDX9.cpp +++ b/GPU/Directx9/TextureCacheDX9.cpp @@ -634,16 +634,8 @@ void TextureCacheDX9::LoadTextureLevel(TexCacheEntry &entry, ReplacedTexture &re decPitch = w * bpp; } - u32 fullAlphaMask = 0; - u32 alphaSum = 0xFFFFFFFF; - DecodeTextureLevel((u8 *)pixelData, decPitch, tfmt, clutformat, texaddr, level, bufw, false, false, false, &alphaSum, &fullAlphaMask); - - // We check before scaling since scaling shouldn't invent alpha from a full alpha texture. - if (AlphaSumIsFull(alphaSum, fullAlphaMask)) { - entry.SetAlphaStatus(TexCacheEntry::STATUS_ALPHA_FULL, level); - } else { - entry.SetAlphaStatus(TexCacheEntry::STATUS_ALPHA_UNKNOWN, level); - } + CheckAlphaResult alphaResult = DecodeTextureLevel((u8 *)pixelData, decPitch, tfmt, clutformat, texaddr, level, bufw, false, false, false); + entry.SetAlphaStatus(alphaResult, level); if (scaleFactor > 1) { scaler.ScaleAlways((u32 *)rect.pBits, pixelData, dstFmt, w, h, scaleFactor); diff --git a/GPU/GLES/TextureCacheGLES.cpp b/GPU/GLES/TextureCacheGLES.cpp index 0e53525689f9..7ec34e67117d 100644 --- a/GPU/GLES/TextureCacheGLES.cpp +++ b/GPU/GLES/TextureCacheGLES.cpp @@ -675,16 +675,9 @@ void TextureCacheGLES::LoadTextureLevel(TexCacheEntry &entry, ReplacedTexture &r decPitch = std::max(w * pixelSize, 4); pixelData = (uint8_t *)AllocateAlignedMemory(decPitch * h * pixelSize, 16); - u32 fullAlphaMask = 0; - u32 alphaSum = 0xFFFFFFFF; - DecodeTextureLevel(pixelData, decPitch, GETextureFormat(entry.format), clutformat, texaddr, level, bufw, true, false, false, &alphaSum, &fullAlphaMask); - // We check before scaling since scaling shouldn't invent alpha from a full alpha texture. - if (AlphaSumIsFull(alphaSum, fullAlphaMask)) { - entry.SetAlphaStatus(TexCacheEntry::STATUS_ALPHA_FULL, level); - } else { - entry.SetAlphaStatus(TexCacheEntry::STATUS_ALPHA_UNKNOWN, level); - } + CheckAlphaResult alphaStatus = DecodeTextureLevel(pixelData, decPitch, GETextureFormat(entry.format), clutformat, texaddr, level, bufw, true, false, false); + entry.SetAlphaStatus(alphaStatus, level); if (scaleFactor > 1) { uint8_t *rearrange = (uint8_t *)AllocateAlignedMemory(w * scaleFactor * h * scaleFactor * 4, 16); diff --git a/GPU/Vulkan/TextureCacheVulkan.cpp b/GPU/Vulkan/TextureCacheVulkan.cpp index cc0c95499c1a..815758cd60fc 100644 --- a/GPU/Vulkan/TextureCacheVulkan.cpp +++ b/GPU/Vulkan/TextureCacheVulkan.cpp @@ -249,7 +249,7 @@ static std::string ReadShaderSrc(const Path &filename) { size_t sz = 0; char *data = (char *)VFSReadFile(filename.c_str(), &sz); if (!data) - return ""; + return std::string(); std::string src(data, sz); delete[] data; @@ -961,6 +961,8 @@ void TextureCacheVulkan::LoadTextureLevel(TexCacheEntry &entry, uint8_t *writePt int w = gstate.getTextureWidth(level); int h = gstate.getTextureHeight(level); + gpuStats.numTexturesDecoded++; + { PROFILE_THIS_SCOPE("decodetex"); @@ -981,17 +983,8 @@ void TextureCacheVulkan::LoadTextureLevel(TexCacheEntry &entry, uint8_t *writePt bool expand32 = !gstate_c.Supports(GPU_SUPPORTS_16BIT_FORMATS) || dstFmt == VK_FORMAT_R8G8B8A8_UNORM; - u32 alphaSum = 0xFFFFFFFF; - u32 fullAlphaMask = 0x0; - - DecodeTextureLevel((u8 *)pixelData, decPitch, tfmt, clutformat, texaddr, level, bufw, false, false, expand32, &alphaSum, &fullAlphaMask); - gpuStats.numTexturesDecoded++; - - if (AlphaSumIsFull(alphaSum, fullAlphaMask)) { - entry.SetAlphaStatus(TexCacheEntry::STATUS_ALPHA_FULL, level); - } else { - entry.SetAlphaStatus(TexCacheEntry::STATUS_ALPHA_UNKNOWN, level); - } + CheckAlphaResult alphaResult = DecodeTextureLevel((u8 *)pixelData, decPitch, tfmt, clutformat, texaddr, level, bufw, false, false, expand32); + entry.SetAlphaStatus(alphaResult, level); if (scaleFactor > 1) { u32 fmt = dstFmt; From a0ca968b1ef37f62c167f0e340d797a464564f46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Thu, 14 Apr 2022 00:33:30 +0200 Subject: [PATCH 07/13] Bugfixes, handle the rest of the cases --- GPU/Common/TextureCacheCommon.cpp | 80 +++++++++++++++++++++++-------- GPU/Common/TextureCacheCommon.h | 2 +- GPU/Common/TextureDecoder.h | 2 + 3 files changed, 62 insertions(+), 22 deletions(-) diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp index 49bea7c2f637..00de9e9aaee1 100644 --- a/GPU/Common/TextureCacheCommon.cpp +++ b/GPU/Common/TextureCacheCommon.cpp @@ -1451,6 +1451,22 @@ void CopyAndSumMask32(u32 *dst, const u32 *src, int width, u32 *outMask) { *outMask &= (u32)mask; } +void CheckMask16(const u16 *src, int width, u32 *outMask) { + u16 mask = 0xFFFF; + for (int i = 0; i < width; i++) { + mask &= src[i]; + } + *outMask &= (u32)mask; +} + +void CheckMask32(const u32 *src, int width, u32 *outMask) { + u32 mask = 0xFFFFFFFF; + for (int i = 0; i < width; i++) { + mask &= src[i]; + } + *outMask &= (u32)mask; +} + CheckAlphaResult TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureFormat format, GEPaletteFormat clutformat, uint32_t texaddr, int level, int bufw, bool reverseColors, bool useBGRA, bool expandTo32bit) { u32 alphaSum = 0xFFFFFFFF; u32 fullAlphaMask = 0x0; @@ -1521,6 +1537,11 @@ CheckAlphaResult TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, G } } } + + if (clutformat == GE_CMODE_16BIT_BGR5650) { + // Our formula at the end of the function can't handle this cast so we return early. + return CHECKALPHA_FULL; + } } break; @@ -1542,16 +1563,13 @@ CheckAlphaResult TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, G break; case GE_TFMT_CLUT8: - ReadIndexedTex(out, outPitch, level, texptr, 1, bufw, expandTo32bit, &alphaSum, &fullAlphaMask); - break; + return ReadIndexedTex(out, outPitch, level, texptr, 1, bufw, expandTo32bit); case GE_TFMT_CLUT16: - ReadIndexedTex(out, outPitch, level, texptr, 2, bufw, expandTo32bit, &alphaSum, &fullAlphaMask); - break; + return ReadIndexedTex(out, outPitch, level, texptr, 2, bufw, expandTo32bit); case GE_TFMT_CLUT32: - ReadIndexedTex(out, outPitch, level, texptr, 4, bufw, expandTo32bit, &alphaSum, &fullAlphaMask); - break; + return ReadIndexedTex(out, outPitch, level, texptr, 4, bufw, expandTo32bit); case GE_TFMT_4444: case GE_TFMT_5551: @@ -1559,13 +1577,16 @@ CheckAlphaResult TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, G if (!swizzled) { // Just a simple copy, we swizzle the color format. if (reverseColors) { - // TODO: Handle alpha mask + // Just check the input's alpha to reuse code. TODO: make a specialized ReverseColors that checks as we go. + fullAlphaMask = TfmtRawToFullAlpha(format); + for (int y = 0; y < h; ++y) { + CheckMask16((const u16 *)(texptr + bufw * sizeof(u16) * y), w, &alphaSum); ReverseColors(out + outPitch * y, texptr + bufw * sizeof(u16) * y, format, w, useBGRA); } } else if (expandTo32bit) { - // TODO: Handle alpha mask for (int y = 0; y < h; ++y) { + CheckMask16((const u16 *)(texptr + bufw * sizeof(u16) * y), w, &alphaSum); ConvertFormatToRGBA8888(format, (u32 *)(out + outPitch * y), (const u16 *)texptr + bufw * y, w); } } else { @@ -1588,13 +1609,17 @@ CheckAlphaResult TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, G const u8 *unswizzled = (u8 *)tmpTexBuf32_.data(); if (reverseColors) { - // TODO: Handle alpha mask + // Just check the swizzled input's alpha to reuse code. TODO: make a specialized ReverseColors that checks as we go. + fullAlphaMask = TfmtRawToFullAlpha(format); for (int y = 0; y < h; ++y) { + CheckMask16((const u16 *)(unswizzled + bufw * sizeof(u16) * y), w, &alphaSum); ReverseColors(out + outPitch * y, unswizzled + bufw * sizeof(u16) * y, format, w, useBGRA); } } else if (expandTo32bit) { - // TODO: Handle alpha mask + // Just check the swizzled input's alpha to reuse code. TODO: make a specialized ConvertFormatToRGBA8888 that checks as we go. + fullAlphaMask = TfmtRawToFullAlpha(format); for (int y = 0; y < h; ++y) { + CheckMask16((const u16 *)(unswizzled + bufw * sizeof(u16) * y), w, &alphaSum); ConvertFormatToRGBA8888(format, (u32 *)(out + outPitch * y), (const u16 *)unswizzled + bufw * y, w); } } else { @@ -1604,13 +1629,17 @@ CheckAlphaResult TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, G } } } + if (format == GE_TFMT_5650) { + return CHECKALPHA_FULL; + } break; case GE_TFMT_8888: if (!swizzled) { if (reverseColors) { - fullAlphaMask = 0; // ignore alpha optimization for now + fullAlphaMask = TfmtRawToFullAlpha(format); for (int y = 0; y < h; ++y) { + CheckMask32((const u32 *)(texptr + bufw * sizeof(u32) * y), w, &alphaSum); ReverseColors(out + outPitch * y, texptr + bufw * sizeof(u32) * y, format, w, useBGRA); } } else { @@ -1631,8 +1660,9 @@ CheckAlphaResult TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, G const u8 *unswizzled = (u8 *)tmpTexBuf32_.data(); if (reverseColors) { - // TODO: Handle alpha mask + fullAlphaMask = TfmtRawToFullAlpha(format); for (int y = 0; y < h; ++y) { + fullAlphaMask = TfmtRawToFullAlpha(format); ReverseColors(out + outPitch * y, unswizzled + bufw * sizeof(u32) * y, format, w, useBGRA); } } else { @@ -1664,7 +1694,7 @@ CheckAlphaResult TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, G return AlphaSumIsFull(alphaSum, fullAlphaMask) ? CHECKALPHA_FULL : CHECKALPHA_ANY; } -void TextureCacheCommon::ReadIndexedTex(u8 *out, int outPitch, int level, const u8 *texptr, int bytesPerIndex, int bufw, bool expandTo32Bit, u32 *alphaSum, u32 *fullAlphaMask) { +CheckAlphaResult TextureCacheCommon::ReadIndexedTex(u8 *out, int outPitch, int level, const u8 *texptr, int bytesPerIndex, int bufw, bool expandTo32Bit) { int w = gstate.getTextureWidth(level); int h = gstate.getTextureHeight(level); @@ -1674,7 +1704,7 @@ void TextureCacheCommon::ReadIndexedTex(u8 *out, int outPitch, int level, const texptr = (u8 *)tmpTexBuf32_.data(); } - int palFormat = gstate.getClutPaletteFormat(); + GEPaletteFormat palFormat = (GEPaletteFormat)gstate.getClutPaletteFormat(); const u16 *clut16 = (const u16 *)clutBuf_; const u32 *clut32 = (const u32 *)clutBuf_; @@ -1683,9 +1713,11 @@ void TextureCacheCommon::ReadIndexedTex(u8 *out, int outPitch, int level, const ConvertFormatToRGBA8888(GEPaletteFormat(palFormat), expandClut_, clut16, 256); clut32 = expandClut_; palFormat = GE_CMODE_32BIT_ABGR8888; - *fullAlphaMask = 0xFF000000; } + u32 alphaSum = 0xFFFFFFFF; + u32 fullAlphaMask = ClutFormatToFullAlpha(palFormat); + switch (palFormat) { case GE_CMODE_16BIT_BGR5650: case GE_CMODE_16BIT_ABGR5551: @@ -1694,19 +1726,19 @@ void TextureCacheCommon::ReadIndexedTex(u8 *out, int outPitch, int level, const switch (bytesPerIndex) { case 1: for (int y = 0; y < h; ++y) { - DeIndexTexture((u16 *)(out + outPitch * y), (const u8 *)texptr + bufw * y, w, clut16, alphaSum); + DeIndexTexture((u16 *)(out + outPitch * y), (const u8 *)texptr + bufw * y, w, clut16, &alphaSum); } break; case 2: for (int y = 0; y < h; ++y) { - DeIndexTexture((u16 *)(out + outPitch * y), (const u16_le *)texptr + bufw * y, w, clut16, alphaSum); + DeIndexTexture((u16 *)(out + outPitch * y), (const u16_le *)texptr + bufw * y, w, clut16, &alphaSum); } break; case 4: for (int y = 0; y < h; ++y) { - DeIndexTexture((u16 *)(out + outPitch * y), (const u32_le *)texptr + bufw * y, w, clut16, alphaSum); + DeIndexTexture((u16 *)(out + outPitch * y), (const u32_le *)texptr + bufw * y, w, clut16, &alphaSum); } break; } @@ -1718,19 +1750,19 @@ void TextureCacheCommon::ReadIndexedTex(u8 *out, int outPitch, int level, const switch (bytesPerIndex) { case 1: for (int y = 0; y < h; ++y) { - DeIndexTexture((u32 *)(out + outPitch * y), (const u8 *)texptr + bufw * y, w, clut32, alphaSum); + DeIndexTexture((u32 *)(out + outPitch * y), (const u8 *)texptr + bufw * y, w, clut32, &alphaSum); } break; case 2: for (int y = 0; y < h; ++y) { - DeIndexTexture((u32 *)(out + outPitch * y), (const u16_le *)texptr + bufw * y, w, clut32, alphaSum); + DeIndexTexture((u32 *)(out + outPitch * y), (const u16_le *)texptr + bufw * y, w, clut32, &alphaSum); } break; case 4: for (int y = 0; y < h; ++y) { - DeIndexTexture((u32 *)(out + outPitch * y), (const u32_le *)texptr + bufw * y, w, clut32, alphaSum); + DeIndexTexture((u32 *)(out + outPitch * y), (const u32_le *)texptr + bufw * y, w, clut32, &alphaSum); } break; } @@ -1741,6 +1773,12 @@ void TextureCacheCommon::ReadIndexedTex(u8 *out, int outPitch, int level, const ERROR_LOG_REPORT(G3D, "Unhandled clut texture mode %d!!!", gstate.getClutPaletteFormat()); break; } + + if (palFormat == GE_CMODE_16BIT_BGR5650) { + return CHECKALPHA_FULL; + } else { + return AlphaSumIsFull(alphaSum, fullAlphaMask) ? CHECKALPHA_FULL : CHECKALPHA_ANY; + } } void TextureCacheCommon::ApplyTexture() { diff --git a/GPU/Common/TextureCacheCommon.h b/GPU/Common/TextureCacheCommon.h index debefa1b3a60..1c56656e27ed 100644 --- a/GPU/Common/TextureCacheCommon.h +++ b/GPU/Common/TextureCacheCommon.h @@ -284,7 +284,7 @@ class TextureCacheCommon { CheckAlphaResult DecodeTextureLevel(u8 *out, int outPitch, GETextureFormat format, GEPaletteFormat clutformat, uint32_t texaddr, int level, int bufw, bool reverseColors, bool useBGRA, bool expandTo32Bit); void UnswizzleFromMem(u32 *dest, u32 destPitch, const u8 *texptr, u32 bufw, u32 height, u32 bytesPerPixel); - void ReadIndexedTex(u8 *out, int outPitch, int level, const u8 *texptr, int bytesPerIndex, int bufw, bool expandTo32Bit, u32 *alphaSum, u32 *fullAlphaMask); + CheckAlphaResult ReadIndexedTex(u8 *out, int outPitch, int level, const u8 *texptr, int bytesPerIndex, int bufw, bool expandTo32Bit); ReplacedTexture &FindReplacement(TexCacheEntry *entry, int &w, int &h); template diff --git a/GPU/Common/TextureDecoder.h b/GPU/Common/TextureDecoder.h index 51f879d7213b..edfbe7972411 100644 --- a/GPU/Common/TextureDecoder.h +++ b/GPU/Common/TextureDecoder.h @@ -126,6 +126,8 @@ inline void DeIndexTexture(/*WRITEONLY*/ ClutT *dest, const IndexT *indexed, int *dest++ = color; } } + + *outAlphaSum = alphaSum; } template From 3bf9ea8de692a3bf9f0961f8e73c7206698de6b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Thu, 14 Apr 2022 00:36:06 +0200 Subject: [PATCH 08/13] Debug log (remove before merge) --- GPU/Vulkan/TextureCacheVulkan.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/GPU/Vulkan/TextureCacheVulkan.cpp b/GPU/Vulkan/TextureCacheVulkan.cpp index 815758cd60fc..c31d2286b033 100644 --- a/GPU/Vulkan/TextureCacheVulkan.cpp +++ b/GPU/Vulkan/TextureCacheVulkan.cpp @@ -984,6 +984,8 @@ void TextureCacheVulkan::LoadTextureLevel(TexCacheEntry &entry, uint8_t *writePt bool expand32 = !gstate_c.Supports(GPU_SUPPORTS_16BIT_FORMATS) || dstFmt == VK_FORMAT_R8G8B8A8_UNORM; CheckAlphaResult alphaResult = DecodeTextureLevel((u8 *)pixelData, decPitch, tfmt, clutformat, texaddr, level, bufw, false, false, expand32); + + WARN_LOG(G3D, "Alpha: full=%d w=%d h=%d level=%d %s/%s", (int)(alphaResult == CHECKALPHA_FULL), w, h, level, GeTextureFormatToString(tfmt), GEPaletteFormatToString(clutformat)); entry.SetAlphaStatus(alphaResult, level); if (scaleFactor > 1) { From 1dae81a98a51c329fb8cdee6fa73a63910a836af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Thu, 14 Apr 2022 01:03:42 +0200 Subject: [PATCH 09/13] Yet another bugfix --- GPU/Common/TextureDecoder.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/Common/TextureDecoder.h b/GPU/Common/TextureDecoder.h index edfbe7972411..9c13f6b9ddc4 100644 --- a/GPU/Common/TextureDecoder.h +++ b/GPU/Common/TextureDecoder.h @@ -127,7 +127,7 @@ inline void DeIndexTexture(/*WRITEONLY*/ ClutT *dest, const IndexT *indexed, int } } - *outAlphaSum = alphaSum; + *outAlphaSum &= (u32)alphaSum; } template From 8bc2d1a653fc5845c2a30c3a9f5f83c3790d7005 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Thu, 14 Apr 2022 01:32:16 +0200 Subject: [PATCH 10/13] SSE optimize a common case for video --- GPU/Common/TextureCacheCommon.cpp | 36 +++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp index 00de9e9aaee1..9617ab9480f6 100644 --- a/GPU/Common/TextureCacheCommon.cpp +++ b/GPU/Common/TextureCacheCommon.cpp @@ -1429,6 +1429,15 @@ inline u32 TfmtRawToFullAlpha(GETextureFormat fmt) { return 0; } +#ifdef _M_SSE +inline u32 SSEReduce32And(__m128i value) { + // TODO: Should use a shuffle instead of slri, probably. + value = _mm_and_si128(value, _mm_srli_si128(value, 64)); + value = _mm_and_si128(value, _mm_srli_si128(value, 32)); + return _mm_cvtsi128_si32(value); +} +#endif + // TODO: SSE/SIMD // At least on x86, compiler actually SIMDs these pretty well. void CopyAndSumMask16(u16 *dst, const u16 *src, int width, u32 *outMask) { @@ -1441,8 +1450,23 @@ void CopyAndSumMask16(u16 *dst, const u16 *src, int width, u32 *outMask) { *outMask &= (u32)mask; } +// Used in video playback so nice to have being fast. void CopyAndSumMask32(u32 *dst, const u32 *src, int width, u32 *outMask) { u32 mask = 0xFFFFFFFF; +#ifdef _M_SSE + if (width >= 4) { + __m128i wideMask = _mm_set1_epi32(0xFFFFFFFF); + while (width >= 4) { + __m128i color = _mm_loadu_si128((__m128i *)src); + wideMask = _mm_and_si128(wideMask, color); + _mm_storeu_si128((__m128i *)dst, color); + src += 4; + dst += 4; + width -= 4; + } + mask = SSEReduce32And(wideMask); + } +#endif for (int i = 0; i < width; i++) { u32 color = src[i]; mask &= color; @@ -1461,6 +1485,18 @@ void CheckMask16(const u16 *src, int width, u32 *outMask) { void CheckMask32(const u32 *src, int width, u32 *outMask) { u32 mask = 0xFFFFFFFF; +#ifdef _M_SSE + if (width >= 4) { + __m128i wideMask = _mm_set1_epi32(0xFFFFFFFF); + while (width >= 4) { + wideMask = _mm_and_si128(wideMask, _mm_loadu_si128((__m128i *)src)); + src += 4; + width -= 4; + } + mask = SSEReduce32And(wideMask); + } +#endif + for (int i = 0; i < width; i++) { mask &= src[i]; } From ffcfef60311d37dc4768b4ec28afe69a2547edde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 15 Apr 2022 00:10:34 +0200 Subject: [PATCH 11/13] Quick NEON optimization of CheckMask32 --- GPU/Common/TextureCacheCommon.cpp | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp index 9617ab9480f6..b818a53d5151 100644 --- a/GPU/Common/TextureCacheCommon.cpp +++ b/GPU/Common/TextureCacheCommon.cpp @@ -15,9 +15,10 @@ // Official git repository and contact information can be found at // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. +#include "ppsspp_config.h" + #include -#include "ppsspp_config.h" #include "Common/Common.h" #include "Common/Data/Convert/ColorConv.h" #include "Common/Profiler/Profiler.h" @@ -1436,6 +1437,20 @@ inline u32 SSEReduce32And(__m128i value) { value = _mm_and_si128(value, _mm_srli_si128(value, 32)); return _mm_cvtsi128_si32(value); } +inline u32 SSEReduce16And(__m128i value) { + // TODO: Should use a shuffle instead of slri, probably. + value = _mm_and_si128(value, _mm_srli_si128(value, 64)); + value = _mm_and_si128(value, _mm_srli_si128(value, 32)); + value = _mm_and_si128(value, _mm_srli_si128(value, 16)); + return _mm_cvtsi128_si32(value); +} +#endif + +#if PPSSPP_ARCH(ARM_NEON) +inline u32 NEONReduce32And(uint32x4_t value) { + // TODO: Maybe a shuffle and a vector and, or something? + return vgetq_lane_u32(value, 0) & vgetq_lane_u32(value, 1) & vgetq_lane_u32(value, 2) & vgetq_lane_u32(value, 3); +} #endif // TODO: SSE/SIMD @@ -1467,6 +1482,7 @@ void CopyAndSumMask32(u32 *dst, const u32 *src, int width, u32 *outMask) { mask = SSEReduce32And(wideMask); } #endif + for (int i = 0; i < width; i++) { u32 color = src[i]; mask &= color; @@ -1495,6 +1511,16 @@ void CheckMask32(const u32 *src, int width, u32 *outMask) { } mask = SSEReduce32And(wideMask); } +#elif PPSSPP_ARCH(ARM_NEON) + if (width >= 4) { + uint32x4_t wideMask = vdupq_n_u32(0xFFFFFFFF); + while (width >= 4) { + wideMask = vandq_u32(wideMask, vld1q_u32(src)); + src += 4; + width -= 4; + } + mask = NEONReduce32And(wideMask); + } #endif for (int i = 0; i < width; i++) { From a3d650d3e9d3ef870c3fc33d454fb5121980787f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 15 Apr 2022 01:03:55 +0200 Subject: [PATCH 12/13] One more NEON optimization --- GPU/Common/TextureCacheCommon.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp index b818a53d5151..d4427bc0c2e7 100644 --- a/GPU/Common/TextureCacheCommon.cpp +++ b/GPU/Common/TextureCacheCommon.cpp @@ -1481,6 +1481,19 @@ void CopyAndSumMask32(u32 *dst, const u32 *src, int width, u32 *outMask) { } mask = SSEReduce32And(wideMask); } +#elif PPSSPP_ARCH(ARM_NEON) + if (width >= 4) { + uint32x4_t wideMask = vdupq_n_u32(0xFFFFFFFF); + while (width >= 4) { + uint32x4_t colors = vld1q_u32(src); + wideMask = vandq_u32(wideMask, colors); + vst1q_u32(dst, colors); + src += 4; + dst += 4; + width -= 4; + } + mask = NEONReduce32And(wideMask); + } #endif for (int i = 0; i < width; i++) { From a5ee1884c1c8242d5a574066a2dffcbafa82b9af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 15 Apr 2022 01:08:14 +0200 Subject: [PATCH 13/13] Address feedback --- GPU/Common/TextureCacheCommon.cpp | 16 +++++----------- GPU/Common/TextureDecoder.cpp | 2 +- GPU/Vulkan/TextureCacheVulkan.cpp | 2 +- 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp index d4427bc0c2e7..c3f56df4b7c5 100644 --- a/GPU/Common/TextureCacheCommon.cpp +++ b/GPU/Common/TextureCacheCommon.cpp @@ -1651,10 +1651,9 @@ CheckAlphaResult TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, G case GE_TFMT_5650: if (!swizzled) { // Just a simple copy, we swizzle the color format. + fullAlphaMask = TfmtRawToFullAlpha(format); if (reverseColors) { // Just check the input's alpha to reuse code. TODO: make a specialized ReverseColors that checks as we go. - fullAlphaMask = TfmtRawToFullAlpha(format); - for (int y = 0; y < h; ++y) { CheckMask16((const u16 *)(texptr + bufw * sizeof(u16) * y), w, &alphaSum); ReverseColors(out + outPitch * y, texptr + bufw * sizeof(u16) * y, format, w, useBGRA); @@ -1665,7 +1664,6 @@ CheckAlphaResult TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, G ConvertFormatToRGBA8888(format, (u32 *)(out + outPitch * y), (const u16 *)texptr + bufw * y, w); } } else { - fullAlphaMask = TfmtRawToFullAlpha(format); for (int y = 0; y < h; ++y) { CopyAndSumMask16((u16 *)(out + outPitch * y), (u16 *)(texptr + bufw * sizeof(u16) * y), w, &alphaSum); } @@ -1683,22 +1681,20 @@ CheckAlphaResult TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, G UnswizzleFromMem(tmpTexBuf32_.data(), bufw * 2, texptr, bufw, h, 2); const u8 *unswizzled = (u8 *)tmpTexBuf32_.data(); + fullAlphaMask = TfmtRawToFullAlpha(format); if (reverseColors) { // Just check the swizzled input's alpha to reuse code. TODO: make a specialized ReverseColors that checks as we go. - fullAlphaMask = TfmtRawToFullAlpha(format); for (int y = 0; y < h; ++y) { CheckMask16((const u16 *)(unswizzled + bufw * sizeof(u16) * y), w, &alphaSum); ReverseColors(out + outPitch * y, unswizzled + bufw * sizeof(u16) * y, format, w, useBGRA); } } else if (expandTo32bit) { // Just check the swizzled input's alpha to reuse code. TODO: make a specialized ConvertFormatToRGBA8888 that checks as we go. - fullAlphaMask = TfmtRawToFullAlpha(format); for (int y = 0; y < h; ++y) { CheckMask16((const u16 *)(unswizzled + bufw * sizeof(u16) * y), w, &alphaSum); ConvertFormatToRGBA8888(format, (u32 *)(out + outPitch * y), (const u16 *)unswizzled + bufw * y, w); } } else { - fullAlphaMask = TfmtRawToFullAlpha(format); for (int y = 0; y < h; ++y) { CopyAndSumMask16((u16 *)(out + outPitch * y), (const u16 *)(unswizzled + bufw * sizeof(u16) * y), w, &alphaSum); } @@ -1711,14 +1707,13 @@ CheckAlphaResult TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, G case GE_TFMT_8888: if (!swizzled) { + fullAlphaMask = TfmtRawToFullAlpha(format); if (reverseColors) { - fullAlphaMask = TfmtRawToFullAlpha(format); for (int y = 0; y < h; ++y) { CheckMask32((const u32 *)(texptr + bufw * sizeof(u32) * y), w, &alphaSum); ReverseColors(out + outPitch * y, texptr + bufw * sizeof(u32) * y, format, w, useBGRA); } } else { - fullAlphaMask = TfmtRawToFullAlpha(format); for (int y = 0; y < h; ++y) { CopyAndSumMask32((u32 *)(out + outPitch * y), (const u32 *)(texptr + bufw * sizeof(u32) * y), w * sizeof(u32), &alphaSum); } @@ -1734,14 +1729,13 @@ CheckAlphaResult TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, G UnswizzleFromMem(tmpTexBuf32_.data(), bufw * 4, texptr, bufw, h, 4); const u8 *unswizzled = (u8 *)tmpTexBuf32_.data(); + fullAlphaMask = TfmtRawToFullAlpha(format); if (reverseColors) { - fullAlphaMask = TfmtRawToFullAlpha(format); for (int y = 0; y < h; ++y) { - fullAlphaMask = TfmtRawToFullAlpha(format); + CheckMask32((const u32 *)(unswizzled + bufw * sizeof(u32) * y), w, &alphaSum); ReverseColors(out + outPitch * y, unswizzled + bufw * sizeof(u32) * y, format, w, useBGRA); } } else { - fullAlphaMask = TfmtRawToFullAlpha(format); for (int y = 0; y < h; ++y) { CopyAndSumMask32((u32 *)(out + outPitch * y), (const u32 *)(unswizzled + bufw * sizeof(u32) * y), w * sizeof(u32), &alphaSum); } diff --git a/GPU/Common/TextureDecoder.cpp b/GPU/Common/TextureDecoder.cpp index 08c0068702ba..d3df034ec360 100644 --- a/GPU/Common/TextureDecoder.cpp +++ b/GPU/Common/TextureDecoder.cpp @@ -630,7 +630,7 @@ void DecodeDXT1Block(u32 *dst, const DXT1Block *src, int pitch, int height, u32 DXTDecoder dxt; dxt.DecodeColors(src, false); dxt.WriteColorsDXT1(dst, src, pitch, height); - *alpha = dxt.AnyNonFullAlpha() ? 0 : 1; + *alpha = dxt.AnyNonFullAlpha() ? 0 : 0xFFFFFFFF; } void DecodeDXT3Block(u32 *dst, const DXT3Block *src, int pitch, int height) { diff --git a/GPU/Vulkan/TextureCacheVulkan.cpp b/GPU/Vulkan/TextureCacheVulkan.cpp index c31d2286b033..0453b648506f 100644 --- a/GPU/Vulkan/TextureCacheVulkan.cpp +++ b/GPU/Vulkan/TextureCacheVulkan.cpp @@ -985,7 +985,7 @@ void TextureCacheVulkan::LoadTextureLevel(TexCacheEntry &entry, uint8_t *writePt CheckAlphaResult alphaResult = DecodeTextureLevel((u8 *)pixelData, decPitch, tfmt, clutformat, texaddr, level, bufw, false, false, expand32); - WARN_LOG(G3D, "Alpha: full=%d w=%d h=%d level=%d %s/%s", (int)(alphaResult == CHECKALPHA_FULL), w, h, level, GeTextureFormatToString(tfmt), GEPaletteFormatToString(clutformat)); + // WARN_LOG(G3D, "Alpha: full=%d w=%d h=%d level=%d %s/%s", (int)(alphaResult == CHECKALPHA_FULL), w, h, level, GeTextureFormatToString(tfmt), GEPaletteFormatToString(clutformat)); entry.SetAlphaStatus(alphaResult, level); if (scaleFactor > 1) {