diff --git a/Core/TextureReplacer.cpp b/Core/TextureReplacer.cpp index 6d64652a9f1d..525cabe52cf1 100644 --- a/Core/TextureReplacer.cpp +++ b/Core/TextureReplacer.cpp @@ -839,6 +839,7 @@ void ReplacedTexture::PrepareData(int level) { int w, h, f; uint8_t *image; + if (LoadZIMPtr(&zim[0], zimSize, &w, &h, &f, &image)) { if (w > info.w || h > info.h) { ERROR_LOG(G3D, "Texture replacement changed since header read: %s", info.file.c_str()); @@ -857,7 +858,7 @@ void ReplacedTexture::PrepareData(int level) { free(image); } - CheckAlphaResult res = CheckAlphaRGBA8888Basic((u32 *)&out[0], info.w, w, h); + CheckAlphaResult res = CheckAlpha32Rect((u32 *)&out[0], info.w, w, h, 0xFF000000); if (res == CHECKALPHA_ANY || level == 0) { alphaStatus_ = ReplacedTextureAlpha(res); } @@ -897,7 +898,7 @@ void ReplacedTexture::PrepareData(int level) { if (!checkedAlpha) { // This will only check the hashed bits. - CheckAlphaResult res = CheckAlphaRGBA8888Basic((u32 *)&out[0], info.w, png.width, png.height); + CheckAlphaResult res = CheckAlpha32Rect((u32 *)&out[0], info.w, png.width, png.height, 0xFF000000); if (res == CHECKALPHA_ANY || level == 0) { alphaStatus_ = ReplacedTextureAlpha(res); } diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp index c3f56df4b7c5..1273a42588f2 100644 --- a/GPU/Common/TextureCacheCommon.cpp +++ b/GPU/Common/TextureCacheCommon.cpp @@ -1430,118 +1430,6 @@ inline u32 TfmtRawToFullAlpha(GETextureFormat fmt) { return 0; } -#ifdef _M_SSE -inline u32 SSEReduce32And(__m128i value) { - // TODO: Should use a shuffle instead of slri, probably. - value = _mm_and_si128(value, _mm_srli_si128(value, 64)); - value = _mm_and_si128(value, _mm_srli_si128(value, 32)); - return _mm_cvtsi128_si32(value); -} -inline u32 SSEReduce16And(__m128i value) { - // TODO: Should use a shuffle instead of slri, probably. - value = _mm_and_si128(value, _mm_srli_si128(value, 64)); - value = _mm_and_si128(value, _mm_srli_si128(value, 32)); - value = _mm_and_si128(value, _mm_srli_si128(value, 16)); - return _mm_cvtsi128_si32(value); -} -#endif - -#if PPSSPP_ARCH(ARM_NEON) -inline u32 NEONReduce32And(uint32x4_t value) { - // TODO: Maybe a shuffle and a vector and, or something? - return vgetq_lane_u32(value, 0) & vgetq_lane_u32(value, 1) & vgetq_lane_u32(value, 2) & vgetq_lane_u32(value, 3); -} -#endif - -// TODO: SSE/SIMD -// At least on x86, compiler actually SIMDs these pretty well. -void CopyAndSumMask16(u16 *dst, const u16 *src, int width, u32 *outMask) { - u16 mask = 0xFFFF; - for (int i = 0; i < width; i++) { - u16 color = src[i]; - mask &= color; - dst[i] = color; - } - *outMask &= (u32)mask; -} - -// Used in video playback so nice to have being fast. -void CopyAndSumMask32(u32 *dst, const u32 *src, int width, u32 *outMask) { - u32 mask = 0xFFFFFFFF; -#ifdef _M_SSE - if (width >= 4) { - __m128i wideMask = _mm_set1_epi32(0xFFFFFFFF); - while (width >= 4) { - __m128i color = _mm_loadu_si128((__m128i *)src); - wideMask = _mm_and_si128(wideMask, color); - _mm_storeu_si128((__m128i *)dst, color); - src += 4; - dst += 4; - width -= 4; - } - mask = SSEReduce32And(wideMask); - } -#elif PPSSPP_ARCH(ARM_NEON) - if (width >= 4) { - uint32x4_t wideMask = vdupq_n_u32(0xFFFFFFFF); - while (width >= 4) { - uint32x4_t colors = vld1q_u32(src); - wideMask = vandq_u32(wideMask, colors); - vst1q_u32(dst, colors); - src += 4; - dst += 4; - width -= 4; - } - mask = NEONReduce32And(wideMask); - } -#endif - - for (int i = 0; i < width; i++) { - u32 color = src[i]; - mask &= color; - dst[i] = color; - } - *outMask &= (u32)mask; -} - -void CheckMask16(const u16 *src, int width, u32 *outMask) { - u16 mask = 0xFFFF; - for (int i = 0; i < width; i++) { - mask &= src[i]; - } - *outMask &= (u32)mask; -} - -void CheckMask32(const u32 *src, int width, u32 *outMask) { - u32 mask = 0xFFFFFFFF; -#ifdef _M_SSE - if (width >= 4) { - __m128i wideMask = _mm_set1_epi32(0xFFFFFFFF); - while (width >= 4) { - wideMask = _mm_and_si128(wideMask, _mm_loadu_si128((__m128i *)src)); - src += 4; - width -= 4; - } - mask = SSEReduce32And(wideMask); - } -#elif PPSSPP_ARCH(ARM_NEON) - if (width >= 4) { - uint32x4_t wideMask = vdupq_n_u32(0xFFFFFFFF); - while (width >= 4) { - wideMask = vandq_u32(wideMask, vld1q_u32(src)); - src += 4; - width -= 4; - } - mask = NEONReduce32And(wideMask); - } -#endif - - for (int i = 0; i < width; i++) { - mask &= src[i]; - } - *outMask &= (u32)mask; -} - CheckAlphaResult TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureFormat format, GEPaletteFormat clutformat, uint32_t texaddr, int level, int bufw, bool reverseColors, bool useBGRA, bool expandTo32bit) { u32 alphaSum = 0xFFFFFFFF; u32 fullAlphaMask = 0x0; diff --git a/GPU/Common/TextureDecoder.cpp b/GPU/Common/TextureDecoder.cpp index d3df034ec360..f769e3858a90 100644 --- a/GPU/Common/TextureDecoder.cpp +++ b/GPU/Common/TextureDecoder.cpp @@ -41,6 +41,13 @@ #endif #endif +#ifdef __clang__ +// Weird how you can't just use #pragma in a macro. +#define DO_NOT_VECTORIZE_LOOP _Pragma("clang loop vectorize(disable)") +#else +#define DO_NOT_VECTORIZE_LOOP +#endif + #ifdef _M_SSE static u32 QuickTexHashSSE2(const void *checkp, u32 size) { @@ -647,417 +654,173 @@ void DecodeDXT5Block(u32 *dst, const DXT5Block *src, int pitch, int height) { } #ifdef _M_SSE -static inline u32 CombineSSEBitsToDWORD(const __m128i &v) { - __m128i temp; - temp = _mm_or_si128(v, _mm_srli_si128(v, 8)); - temp = _mm_or_si128(temp, _mm_srli_si128(temp, 4)); - return _mm_cvtsi128_si32(temp); +inline u32 SSEReduce32And(__m128i value) { + // TODO: Should use a shuffle instead of slri, probably. + value = _mm_and_si128(value, _mm_srli_si128(value, 64)); + value = _mm_and_si128(value, _mm_srli_si128(value, 32)); + return _mm_cvtsi128_si32(value); +} +inline u32 SSEReduce16And(__m128i value) { + // TODO: Should use a shuffle instead of slri, probably. + value = _mm_and_si128(value, _mm_srli_si128(value, 64)); + value = _mm_and_si128(value, _mm_srli_si128(value, 32)); + u32 mask = _mm_cvtsi128_si32(value); + return mask & (mask >> 16); } +#endif -CheckAlphaResult CheckAlphaRGBA8888SSE2(const u32 *pixelData, int stride, int w, int h) { - const __m128i mask = _mm_set1_epi32(0xFF000000); - - const __m128i *p = (const __m128i *)pixelData; - const int w4 = w / 4; - const int stride4 = stride / 4; - - __m128i bits = mask; - for (int y = 0; y < h; ++y) { - for (int i = 0; i < w4; ++i) { - const __m128i a = _mm_load_si128(&p[i]); - bits = _mm_and_si128(bits, a); - } - - __m128i result = _mm_xor_si128(bits, mask); - if (CombineSSEBitsToDWORD(result) != 0) { - return CHECKALPHA_ANY; - } - - p += stride4; - } - - return CHECKALPHA_FULL; -} - -CheckAlphaResult CheckAlphaABGR4444SSE2(const u32 *pixelData, int stride, int w, int h) { - const __m128i mask = _mm_set1_epi16((short)0x000F); - - const __m128i *p = (const __m128i *)pixelData; - const int w8 = w / 8; - const int stride8 = stride / 8; - - __m128i bits = mask; - for (int y = 0; y < h; ++y) { - for (int i = 0; i < w8; ++i) { - const __m128i a = _mm_load_si128(&p[i]); - bits = _mm_and_si128(bits, a); - } - - __m128i result = _mm_xor_si128(bits, mask); - if (CombineSSEBitsToDWORD(result) != 0) { - return CHECKALPHA_ANY; - } - - p += stride8; - } - - return CHECKALPHA_FULL; +#if PPSSPP_ARCH(ARM_NEON) +inline u32 NEONReduce32And(uint32x4_t value) { + // TODO: Maybe a shuffle and a vector and, or something? + return vgetq_lane_u32(value, 0) & vgetq_lane_u32(value, 1) & vgetq_lane_u32(value, 2) & vgetq_lane_u32(value, 3); } - -CheckAlphaResult CheckAlphaABGR1555SSE2(const u32 *pixelData, int stride, int w, int h) { - const __m128i mask = _mm_set1_epi16((short)0x0001); - - const __m128i *p = (const __m128i *)pixelData; - const int w8 = w / 8; - const int stride8 = stride / 8; - - __m128i bits = mask; - for (int y = 0; y < h; ++y) { - for (int i = 0; i < w8; ++i) { - const __m128i a = _mm_load_si128(&p[i]); - bits = _mm_and_si128(bits, a); - } - - __m128i result = _mm_xor_si128(bits, mask); - if (CombineSSEBitsToDWORD(result) != 0) { - return CHECKALPHA_ANY; - } - - p += stride8; - } - - return CHECKALPHA_FULL; +inline u32 NEONReduce16And(uint16x8_t value) { + uint32x4_t value32 = vreinterpretq_u32_u16(value); + // TODO: Maybe a shuffle and a vector and, or something? + u32 mask = vgetq_lane_u32(value32, 0) & vgetq_lane_u32(value32, 1) & vgetq_lane_u32(value32, 2) & vgetq_lane_u32(value32, 3); + return mask & (mask >> 16); } +#endif -CheckAlphaResult CheckAlphaRGBA4444SSE2(const u32 *pixelData, int stride, int w, int h) { - const __m128i mask = _mm_set1_epi16((short)0xF000); - - const __m128i *p = (const __m128i *)pixelData; - const int w8 = w / 8; - const int stride8 = stride / 8; - - __m128i bits = mask; - for (int y = 0; y < h; ++y) { - for (int i = 0; i < w8; ++i) { - const __m128i a = _mm_load_si128(&p[i]); - bits = _mm_and_si128(bits, a); - } - - __m128i result = _mm_xor_si128(bits, mask); - if (CombineSSEBitsToDWORD(result) != 0) { - return CHECKALPHA_ANY; - } - - p += stride8; +// TODO: SSE/SIMD +// At least on x86, compiler actually SIMDs these pretty well. +void CopyAndSumMask16(u16 *dst, const u16 *src, int width, u32 *outMask) { + u16 mask = 0xFFFF; +#ifdef _M_SSE + if (width >= 8) { + __m128i wideMask = _mm_set1_epi32(0xFFFFFFFF); + while (width >= 8) { + __m128i color = _mm_loadu_si128((__m128i *)src); + wideMask = _mm_and_si128(wideMask, color); + _mm_storeu_si128((__m128i *)dst, color); + src += 8; + dst += 8; + width -= 8; + } + mask = SSEReduce16And(wideMask); } - - return CHECKALPHA_FULL; -} - -CheckAlphaResult CheckAlphaRGBA5551SSE2(const u32 *pixelData, int stride, int w, int h) { - const __m128i mask = _mm_set1_epi16((short)0x8000); - - const __m128i *p = (const __m128i *)pixelData; - const int w8 = w / 8; - const int stride8 = stride / 8; - - __m128i bits = mask; - for (int y = 0; y < h; ++y) { - for (int i = 0; i < w8; ++i) { - const __m128i a = _mm_load_si128(&p[i]); - bits = _mm_and_si128(bits, a); - } - - __m128i result = _mm_xor_si128(bits, mask); - if (CombineSSEBitsToDWORD(result) != 0) { - return CHECKALPHA_ANY; - } - - p += stride8; +#elif PPSSPP_ARCH(ARM_NEON) + if (width >= 8) { + uint16x8_t wideMask = vdupq_n_u16(0xFFFF); + while (width >= 8) { + uint16x8_t colors = vld1q_u16(src); + wideMask = vandq_u16(wideMask, colors); + vst1q_u16(dst, colors); + src += 8; + dst += 8; + width -= 8; + } + mask = NEONReduce16And(wideMask); } - - return CHECKALPHA_FULL; -} - -#endif // _M_SSE - -#if PPSSPP_ARCH(ARM_NEON) - -static inline bool VectorIsNonZeroNEON(const uint32x4_t &v) { - u64 low = vgetq_lane_u64(vreinterpretq_u64_u32(v), 0); - u64 high = vgetq_lane_u64(vreinterpretq_u64_u32(v), 1); - - return (low | high) != 0; -} - -#ifndef _MSC_VER -// MSVC consider this function the same as the one above! uint16x8_t is typedef'd to the same type as uint32x4_t. -static inline bool VectorIsNonZeroNEON(const uint16x8_t &v) { - u64 low = vgetq_lane_u64(vreinterpretq_u64_u16(v), 0); - u64 high = vgetq_lane_u64(vreinterpretq_u64_u16(v), 1); - - return (low | high) != 0; -} #endif -CheckAlphaResult CheckAlphaRGBA8888NEON(const u32 *pixelData, int stride, int w, int h) { - const u32 *p = (const u32 *)pixelData; - - const uint32x4_t mask = vdupq_n_u32(0xFF000000); - uint32x4_t bits = mask; - for (int y = 0; y < h; ++y) { - for (int i = 0; i < w; i += 4) { - const uint32x4_t a = vld1q_u32(&p[i]); - bits = vandq_u32(bits, a); - } - - uint32x4_t result = veorq_u32(bits, mask); - if (VectorIsNonZeroNEON(result)) { - return CHECKALPHA_ANY; - } - - p += stride; + DO_NOT_VECTORIZE_LOOP + for (int i = 0; i < width; i++) { + u16 color = src[i]; + mask &= color; + dst[i] = color; } - - return CHECKALPHA_FULL; + *outMask &= (u32)mask; } -CheckAlphaResult CheckAlphaABGR4444NEON(const u32 *pixelData, int stride, int w, int h) { - const u16 *p = (const u16 *)pixelData; - - const uint16x8_t mask = vdupq_n_u16((u16)0x000F); - uint16x8_t bits = mask; - for (int y = 0; y < h; ++y) { - for (int i = 0; i < w; i += 8) { - const uint16x8_t a = vld1q_u16(&p[i]); - bits = vandq_u16(bits, a); - } - - uint16x8_t result = veorq_u16(bits, mask); - if (VectorIsNonZeroNEON(result)) { - return CHECKALPHA_ANY; - } - - p += stride; +// Used in video playback so nice to have being fast. +void CopyAndSumMask32(u32 *dst, const u32 *src, int width, u32 *outMask) { + u32 mask = 0xFFFFFFFF; +#ifdef _M_SSE + if (width >= 4) { + __m128i wideMask = _mm_set1_epi32(0xFFFFFFFF); + while (width >= 4) { + __m128i color = _mm_loadu_si128((__m128i *)src); + wideMask = _mm_and_si128(wideMask, color); + _mm_storeu_si128((__m128i *)dst, color); + src += 4; + dst += 4; + width -= 4; + } + mask = SSEReduce32And(wideMask); } - - return CHECKALPHA_FULL; -} - -CheckAlphaResult CheckAlphaABGR1555NEON(const u32 *pixelData, int stride, int w, int h) { - const u16 *p = (const u16 *)pixelData; - - const uint16x8_t mask = vdupq_n_u16((u16)0x0001); - uint16x8_t bits = mask; - for (int y = 0; y < h; ++y) { - for (int i = 0; i < w; i += 8) { - const uint16x8_t a = vld1q_u16(&p[i]); - bits = vandq_u16(bits, a); - } - - uint16x8_t result = veorq_u16(bits, mask); - if (VectorIsNonZeroNEON(result)) { - return CHECKALPHA_ANY; - } - - p += stride; +#elif PPSSPP_ARCH(ARM_NEON) + if (width >= 4) { + uint32x4_t wideMask = vdupq_n_u32(0xFFFFFFFF); + while (width >= 4) { + uint32x4_t colors = vld1q_u32(src); + wideMask = vandq_u32(wideMask, colors); + vst1q_u32(dst, colors); + src += 4; + dst += 4; + width -= 4; + } + mask = NEONReduce32And(wideMask); } +#endif - return CHECKALPHA_FULL; -} - -CheckAlphaResult CheckAlphaRGBA4444NEON(const u32 *pixelData, int stride, int w, int h) { - const u16 *p = (const u16 *)pixelData; - - const uint16x8_t mask = vdupq_n_u16((u16)0xF000); - uint16x8_t bits = mask; - for (int y = 0; y < h; ++y) { - for (int i = 0; i < w; i += 8) { - const uint16x8_t a = vld1q_u16(&p[i]); - bits = vandq_u16(bits, a); - } - - uint16x8_t result = veorq_u16(bits, mask); - if (VectorIsNonZeroNEON(result)) { - return CHECKALPHA_ANY; - } - - p += stride; + DO_NOT_VECTORIZE_LOOP + for (int i = 0; i < width; i++) { + u32 color = src[i]; + mask &= color; + dst[i] = color; } - - return CHECKALPHA_FULL; + *outMask &= (u32)mask; } -CheckAlphaResult CheckAlphaRGBA5551NEON(const u32 *pixelData, int stride, int w, int h) { - const u16 *p = (const u16 *)pixelData; - - const uint16x8_t mask = vdupq_n_u16((u16)0x8000); - uint16x8_t bits = mask; - for (int y = 0; y < h; ++y) { - for (int i = 0; i < w; i += 8) { - const uint16x8_t a = vld1q_u16(&p[i]); - bits = vandq_u16(bits, a); - } - - uint16x8_t result = veorq_u16(bits, mask); - if (VectorIsNonZeroNEON(result)) { - return CHECKALPHA_ANY; +void CheckMask16(const u16 *src, int width, u32 *outMask) { + u16 mask = 0xFFFF; +#ifdef _M_SSE + if (width >= 8) { + __m128i wideMask = _mm_set1_epi32(0xFFFFFFFF); + while (width >= 8) { + wideMask = _mm_and_si128(wideMask, _mm_loadu_si128((__m128i *)src)); + src += 8; + width -= 8; } - - p += stride; + mask = SSEReduce16And(wideMask); } - - return CHECKALPHA_FULL; -} - -#endif - -CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w, int h) { - // Use SIMD if aligned to 16 bytes / 4 pixels (almost always the case.) - if ((w & 3) == 0 && (stride & 3) == 0) { -#ifdef _M_SSE - return CheckAlphaRGBA8888SSE2(pixelData, stride, w, h); #elif PPSSPP_ARCH(ARM_NEON) - return CheckAlphaRGBA8888NEON(pixelData, stride, w, h); -#endif - } - - const u32 *p = pixelData; - for (int y = 0; y < h; ++y) { - u32 bits = 0xFF000000; - for (int i = 0; i < w; ++i) { - bits &= p[i]; + if (width >= 8) { + uint16x8_t wideMask = vdupq_n_u16(0xFFFF); + while (width >= 8) { + wideMask = vandq_u16(wideMask, vld1q_u16(src)); + src += 8; + width -= 8; } - - if (bits != 0xFF000000) { - // We're done, we hit non-full alpha. - return CHECKALPHA_ANY; - } - - p += stride; + mask = NEONReduce16And(wideMask); } - - return CHECKALPHA_FULL; -} - -CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w, int h) { - // Use SIMD if aligned to 16 bytes / 8 pixels (usually the case.) - if ((w & 7) == 0 && (stride & 7) == 0) { -#ifdef _M_SSE - return CheckAlphaABGR4444SSE2(pixelData, stride, w, h); -#elif PPSSPP_ARCH(ARM_NEON) - return CheckAlphaABGR4444NEON(pixelData, stride, w, h); #endif - } - const u32 *p = pixelData; - const int w2 = (w + 1) / 2; - const int stride2 = (stride + 1) / 2; - - for (int y = 0; y < h; ++y) { - u32 bits = 0x000F000F; - for (int i = 0; i < w2; ++i) { - bits &= p[i]; - } - - if (bits != 0x000F000F) { - // We're done, we hit non-full alpha. - return CHECKALPHA_ANY; - } - - p += stride2; + DO_NOT_VECTORIZE_LOOP + for (int i = 0; i < width; i++) { + mask &= src[i]; } - - return CHECKALPHA_FULL; + *outMask &= (u32)mask; } -CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w, int h) { - // Use SIMD if aligned to 16 bytes / 8 pixels (usually the case.) - if ((w & 7) == 0 && (stride & 7) == 0) { +void CheckMask32(const u32 *src, int width, u32 *outMask) { + u32 mask = 0xFFFFFFFF; #ifdef _M_SSE - return CheckAlphaABGR1555SSE2(pixelData, stride, w, h); -#elif PPSSPP_ARCH(ARM_NEON) - return CheckAlphaABGR1555NEON(pixelData, stride, w, h); -#endif - } - - const u32 *p = pixelData; - const int w2 = (w + 1) / 2; - const int stride2 = (stride + 1) / 2; - - for (int y = 0; y < h; ++y) { - u32 bits = 0x00010001; - for (int i = 0; i < w2; ++i) { - bits &= p[i]; - } - - if (bits != 0x00010001) { - return CHECKALPHA_ANY; + if (width >= 4) { + __m128i wideMask = _mm_set1_epi32(0xFFFFFFFF); + while (width >= 4) { + wideMask = _mm_and_si128(wideMask, _mm_loadu_si128((__m128i *)src)); + src += 4; + width -= 4; } - - p += stride2; + mask = SSEReduce32And(wideMask); } - - return CHECKALPHA_FULL; -} - -CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w, int h) { - // Use SSE if aligned to 16 bytes / 8 pixels (usually the case.) - if ((w & 7) == 0 && (stride & 7) == 0) { -#ifdef _M_SSE - return CheckAlphaRGBA4444SSE2(pixelData, stride, w, h); #elif PPSSPP_ARCH(ARM_NEON) - return CheckAlphaRGBA4444NEON(pixelData, stride, w, h); -#endif - } - - const u32 *p = pixelData; - const int w2 = (w + 1) / 2; - const int stride2 = (stride + 1) / 2; - - for (int y = 0; y < h; ++y) { - u32 bits = 0xF000F000; - for (int i = 0; i < w2; ++i) { - bits &= p[i]; - } - - if (bits != 0xF000F000) { - // We're done, we hit non-full alpha. - return CHECKALPHA_ANY; + if (width >= 4) { + uint32x4_t wideMask = vdupq_n_u32(0xFFFFFFFF); + while (width >= 4) { + wideMask = vandq_u32(wideMask, vld1q_u32(src)); + src += 4; + width -= 4; } - - p += stride2; + mask = NEONReduce32And(wideMask); } - - return CHECKALPHA_FULL; -} - -CheckAlphaResult CheckAlphaRGBA5551Basic(const u32 *pixelData, int stride, int w, int h) { - // Use SSE if aligned to 16 bytes / 8 pixels (usually the case.) - if ((w & 7) == 0 && (stride & 7) == 0) { -#ifdef _M_SSE - return CheckAlphaRGBA5551SSE2(pixelData, stride, w, h); -#elif PPSSPP_ARCH(ARM_NEON) - return CheckAlphaRGBA5551NEON(pixelData, stride, w, h); #endif - } - const u32 *p = pixelData; - const int w2 = (w + 1) / 2; - const int stride2 = (stride + 1) / 2; - - for (int y = 0; y < h; ++y) { - u32 bits = 0x80008000; - for (int i = 0; i < w2; ++i) { - bits &= p[i]; - } - - if (bits != 0x80008000) { - return CHECKALPHA_ANY; - } - - p += stride2; + DO_NOT_VECTORIZE_LOOP + for (int i = 0; i < width; i++) { + mask &= src[i]; } - - return CHECKALPHA_FULL; + *outMask &= (u32)mask; } diff --git a/GPU/Common/TextureDecoder.h b/GPU/Common/TextureDecoder.h index 9c13f6b9ddc4..172bb039f9d9 100644 --- a/GPU/Common/TextureDecoder.h +++ b/GPU/Common/TextureDecoder.h @@ -38,11 +38,11 @@ void DoUnswizzleTex16(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch u32 StableQuickTexHash(const void *checkp, u32 size); -CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w, int h); -CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w, int h); -CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w, int h); -CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w, int h); -CheckAlphaResult CheckAlphaRGBA5551Basic(const u32 *pixelData, int stride, int w, int h); +// outMask is an in/out parameter. +void CopyAndSumMask16(u16 *dst, const u16 *src, int width, u32 *outMask); +void CopyAndSumMask32(u32 *dst, const u32 *src, int width, u32 *outMask); +void CheckMask16(const u16 *src, int width, u32 *outMask); +void CheckMask32(const u32 *src, int width, u32 *outMask); // All these DXT structs are in the reverse order, as compared to PC. // On PC, alpha comes before color, and interpolants are before the tile data. @@ -98,6 +98,26 @@ inline bool AlphaSumIsFull(u32 alphaSum, u32 fullAlphaMask) { return fullAlphaMask != 0 && (alphaSum & fullAlphaMask) == fullAlphaMask; } +inline CheckAlphaResult CheckAlpha16(const u16 *pixelData, int width, u32 fullAlphaMask) { + u32 alphaSum = 0xFFFFFFFF; + CheckMask16(pixelData, width, &alphaSum); + return AlphaSumIsFull(alphaSum, fullAlphaMask) ? CHECKALPHA_FULL : CHECKALPHA_ANY; +} + +inline CheckAlphaResult CheckAlpha32(const u32 *pixelData, int width, u32 fullAlphaMask) { + u32 alphaSum = 0xFFFFFFFF; + CheckMask32(pixelData, width, &alphaSum); + return AlphaSumIsFull(alphaSum, fullAlphaMask) ? CHECKALPHA_FULL : CHECKALPHA_ANY; +} + +inline CheckAlphaResult CheckAlpha32Rect(const u32 *pixelData, int stride, int width, int height, u32 fullAlphaMask) { + u32 alphaSum = 0xFFFFFFFF; + for (int y = 0; y < height; y++) { + CheckMask32(pixelData + stride * y, width, &alphaSum); + } + return AlphaSumIsFull(alphaSum, fullAlphaMask) ? CHECKALPHA_FULL : CHECKALPHA_ANY; +} + template inline void DeIndexTexture(/*WRITEONLY*/ ClutT *dest, const IndexT *indexed, int length, const ClutT *clut, u32 *outAlphaSum) { // Usually, there is no special offset, mask, or shift. diff --git a/GPU/D3D11/TextureCacheD3D11.cpp b/GPU/D3D11/TextureCacheD3D11.cpp index ee428c0ab679..24be25aca8d9 100644 --- a/GPU/D3D11/TextureCacheD3D11.cpp +++ b/GPU/D3D11/TextureCacheD3D11.cpp @@ -418,8 +418,8 @@ void TextureCacheD3D11::ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer, const u32 bytesPerColor = clutFormat == GE_CMODE_32BIT_ABGR8888 ? sizeof(u32) : sizeof(u16); const u32 clutTotalColors = clutMaxBytes_ / bytesPerColor; - TexCacheEntry::TexStatus alphaStatus = CheckAlpha(clutBuf_, GetClutDestFormatD3D11(clutFormat), clutTotalColors, clutTotalColors, 1); - gstate_c.SetTextureFullAlpha(alphaStatus == TexCacheEntry::STATUS_ALPHA_FULL); + CheckAlphaResult alphaStatus = CheckAlpha(clutBuf_, GetClutDestFormatD3D11(clutFormat), clutTotalColors); + gstate_c.SetTextureFullAlpha(alphaStatus == CHECKALPHA_FULL); } else { gstate_c.SetTextureFullAlpha(gstate.getTextureFormat() == GE_TFMT_5650); framebufferManager_->RebindFramebuffer("RebindFramebuffer - ApplyTextureFramebuffer"); @@ -591,25 +591,18 @@ DXGI_FORMAT TextureCacheD3D11::GetDestFormat(GETextureFormat format, GEPaletteFo } } -TexCacheEntry::TexStatus TextureCacheD3D11::CheckAlpha(const u32 *pixelData, u32 dstFmt, int stride, int w, int h) { - CheckAlphaResult res; +CheckAlphaResult TextureCacheD3D11::CheckAlpha(const u32 *pixelData, u32 dstFmt, int w) { switch (dstFmt) { case DXGI_FORMAT_B4G4R4A4_UNORM: - res = CheckAlphaRGBA4444Basic(pixelData, stride, w, h); - break; + return CheckAlpha16((const u16 *)pixelData, w, 0xF000); case DXGI_FORMAT_B5G5R5A1_UNORM: - res = CheckAlphaRGBA5551Basic(pixelData, stride, w, h); - break; + return CheckAlpha16((const u16 *)pixelData, w, 0x8000); case DXGI_FORMAT_B5G6R5_UNORM: // Never has any alpha. - res = CHECKALPHA_FULL; - break; + return CHECKALPHA_FULL; default: - res = CheckAlphaRGBA8888Basic(pixelData, stride, w, h); - break; + return CheckAlpha32((const u32 *)pixelData, w, 0xFF000000); } - - return (TexCacheEntry::TexStatus)res; } ReplacedTextureFormat FromD3D11Format(u32 fmt) { diff --git a/GPU/D3D11/TextureCacheD3D11.h b/GPU/D3D11/TextureCacheD3D11.h index f72da416ec22..8ed4d417f4e2 100644 --- a/GPU/D3D11/TextureCacheD3D11.h +++ b/GPU/D3D11/TextureCacheD3D11.h @@ -70,7 +70,7 @@ class TextureCacheD3D11 : public TextureCacheCommon { private: void LoadTextureLevel(TexCacheEntry &entry, ReplacedTexture &replaced, int level, int maxLevel, int scaleFactor, DXGI_FORMAT dstFmt); DXGI_FORMAT GetDestFormat(GETextureFormat format, GEPaletteFormat clutFormat) const; - static TexCacheEntry::TexStatus CheckAlpha(const u32 *pixelData, u32 dstFmt, int stride, int w, int h); + static CheckAlphaResult CheckAlpha(const u32 *pixelData, u32 dstFmt, int w); void UpdateCurrentClut(GEPaletteFormat clutFormat, u32 clutBase, bool clutIndexIsSimple) override; void ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer, GETextureFormat texFormat, FramebufferNotificationChannel channel) override; diff --git a/GPU/Directx9/TextureCacheDX9.cpp b/GPU/Directx9/TextureCacheDX9.cpp index fb2efaa8e8bc..3051c9b33f09 100644 --- a/GPU/Directx9/TextureCacheDX9.cpp +++ b/GPU/Directx9/TextureCacheDX9.cpp @@ -375,8 +375,8 @@ void TextureCacheDX9::ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer, G const u32 bytesPerColor = clutFormat == GE_CMODE_32BIT_ABGR8888 ? sizeof(u32) : sizeof(u16); const u32 clutTotalColors = clutMaxBytes_ / bytesPerColor; - TexCacheEntry::TexStatus alphaStatus = CheckAlpha(clutBuf_, getClutDestFormat(clutFormat), clutTotalColors, clutTotalColors, 1); - gstate_c.SetTextureFullAlpha(alphaStatus == TexCacheEntry::STATUS_ALPHA_FULL); + CheckAlphaResult alphaStatus = CheckAlpha(clutBuf_, getClutDestFormat(clutFormat), clutTotalColors); + gstate_c.SetTextureFullAlpha(alphaStatus == CHECKALPHA_FULL); } else { framebufferManagerDX9_->BindFramebufferAsColorTexture(0, framebuffer, BINDFBCOLOR_MAY_COPY_WITH_UV | BINDFBCOLOR_APPLY_TEX_OFFSET); @@ -525,25 +525,18 @@ D3DFORMAT TextureCacheDX9::GetDestFormat(GETextureFormat format, GEPaletteFormat } } -TexCacheEntry::TexStatus TextureCacheDX9::CheckAlpha(const u32 *pixelData, u32 dstFmt, int stride, int w, int h) { - CheckAlphaResult res; +CheckAlphaResult TextureCacheDX9::CheckAlpha(const u32 *pixelData, u32 dstFmt, int w) { switch (dstFmt) { case D3DFMT_A4R4G4B4: - res = CheckAlphaRGBA4444Basic(pixelData, stride, w, h); - break; + return CheckAlpha16((const u16 *)pixelData, w, 0xF000); case D3DFMT_A1R5G5B5: - res = CheckAlphaRGBA5551Basic(pixelData, stride, w, h); - break; + return CheckAlpha16((const u16 *)pixelData, w, 0x8000); case D3DFMT_R5G6B5: // Never has any alpha. - res = CHECKALPHA_FULL; - break; + return CHECKALPHA_FULL; default: - res = CheckAlphaRGBA8888Basic(pixelData, stride, w, h); - break; + return CheckAlpha32(pixelData, w, 0xFF000000); } - - return (TexCacheEntry::TexStatus)res; } ReplacedTextureFormat FromD3D9Format(u32 fmt) { diff --git a/GPU/Directx9/TextureCacheDX9.h b/GPU/Directx9/TextureCacheDX9.h index bb1b58f79c76..190e81a06bc9 100644 --- a/GPU/Directx9/TextureCacheDX9.h +++ b/GPU/Directx9/TextureCacheDX9.h @@ -64,7 +64,7 @@ class TextureCacheDX9 : public TextureCacheCommon { void LoadTextureLevel(TexCacheEntry &entry, ReplacedTexture &replaced, int level, int maxLevel, int scaleFactor, u32 dstFmt); D3DFORMAT GetDestFormat(GETextureFormat format, GEPaletteFormat clutFormat) const; - static TexCacheEntry::TexStatus CheckAlpha(const u32 *pixelData, u32 dstFmt, int stride, int w, int h); + static CheckAlphaResult CheckAlpha(const u32 *pixelData, u32 dstFmt, int w); void UpdateCurrentClut(GEPaletteFormat clutFormat, u32 clutBase, bool clutIndexIsSimple) override; void ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer, GETextureFormat texFormat, FramebufferNotificationChannel channel) override; diff --git a/GPU/GLES/TextureCacheGLES.cpp b/GPU/GLES/TextureCacheGLES.cpp index 7ec34e67117d..6551563bce45 100644 --- a/GPU/GLES/TextureCacheGLES.cpp +++ b/GPU/GLES/TextureCacheGLES.cpp @@ -373,8 +373,8 @@ void TextureCacheGLES::ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer, gstate_c.depalFramebufferFormat = framebuffer->drawnFormat; const u32 bytesPerColor = clutFormat == GE_CMODE_32BIT_ABGR8888 ? sizeof(u32) : sizeof(u16); const u32 clutTotalColors = clutMaxBytes_ / bytesPerColor; - TexCacheEntry::TexStatus alphaStatus = CheckAlpha((const uint8_t *)clutBuf_, getClutDestFormat(clutFormat), clutTotalColors, clutTotalColors, 1); - gstate_c.SetTextureFullAlpha(alphaStatus == TexCacheEntry::STATUS_ALPHA_FULL); + CheckAlphaResult alphaStatus = CheckAlpha((const uint8_t *)clutBuf_, getClutDestFormat(clutFormat), clutTotalColors); + gstate_c.SetTextureFullAlpha(alphaStatus == CHECKALPHA_FULL); return; } @@ -407,8 +407,8 @@ void TextureCacheGLES::ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer, const u32 bytesPerColor = clutFormat == GE_CMODE_32BIT_ABGR8888 ? sizeof(u32) : sizeof(u16); const u32 clutTotalColors = clutMaxBytes_ / bytesPerColor; - TexCacheEntry::TexStatus alphaStatus = CheckAlpha((const uint8_t *)clutBuf_, getClutDestFormat(clutFormat), clutTotalColors, clutTotalColors, 1); - gstate_c.SetTextureFullAlpha(alphaStatus == TexCacheEntry::STATUS_ALPHA_FULL); + CheckAlphaResult alphaStatus = CheckAlpha((const uint8_t *)clutBuf_, getClutDestFormat(clutFormat), clutTotalColors); + gstate_c.SetTextureFullAlpha(alphaStatus == CHECKALPHA_FULL); } else { framebufferManagerGL_->BindFramebufferAsColorTexture(0, framebuffer, BINDFBCOLOR_MAY_COPY_WITH_UV | BINDFBCOLOR_APPLY_TEX_OFFSET); @@ -614,25 +614,18 @@ Draw::DataFormat TextureCacheGLES::GetDestFormat(GETextureFormat format, GEPalet } } -TexCacheEntry::TexStatus TextureCacheGLES::CheckAlpha(const uint8_t *pixelData, Draw::DataFormat dstFmt, int stride, int w, int h) { - CheckAlphaResult res; +CheckAlphaResult TextureCacheGLES::CheckAlpha(const uint8_t *pixelData, Draw::DataFormat dstFmt, int w) { switch (dstFmt) { case Draw::DataFormat::R4G4B4A4_UNORM_PACK16: - res = CheckAlphaABGR4444Basic((const uint32_t *)pixelData, stride, w, h); - break; + return CheckAlpha16((const u16 *)pixelData, w, 0x000F); case Draw::DataFormat::R5G5B5A1_UNORM_PACK16: - res = CheckAlphaABGR1555Basic((const uint32_t *)pixelData, stride, w, h); - break; + return CheckAlpha16((const u16 *)pixelData, w, 0x0001); case Draw::DataFormat::R5G6B5_UNORM_PACK16: // Never has any alpha. - res = CHECKALPHA_FULL; - break; + return CHECKALPHA_FULL; default: - res = CheckAlphaRGBA8888Basic((const uint32_t *)pixelData, stride, w, h); - break; + return CheckAlpha32((const u32 *)pixelData, w, 0xFF000000); // note, the normal order here, unlike the 16-bit formats } - - return (TexCacheEntry::TexStatus)res; } void TextureCacheGLES::LoadTextureLevel(TexCacheEntry &entry, ReplacedTexture &replaced, int level, int scaleFactor, Draw::DataFormat dstFmt) { diff --git a/GPU/GLES/TextureCacheGLES.h b/GPU/GLES/TextureCacheGLES.h index dc01af9c664c..7327d4877bb0 100644 --- a/GPU/GLES/TextureCacheGLES.h +++ b/GPU/GLES/TextureCacheGLES.h @@ -73,7 +73,7 @@ class TextureCacheGLES : public TextureCacheCommon { void LoadTextureLevel(TexCacheEntry &entry, ReplacedTexture &replaced, int level, int scaleFactor, Draw::DataFormat dstFmt); Draw::DataFormat GetDestFormat(GETextureFormat format, GEPaletteFormat clutFormat) const; - static TexCacheEntry::TexStatus CheckAlpha(const uint8_t *pixelData, Draw::DataFormat dstFmt, int stride, int w, int h); + static CheckAlphaResult CheckAlpha(const uint8_t *pixelData, Draw::DataFormat dstFmt, int w); void UpdateCurrentClut(GEPaletteFormat clutFormat, u32 clutBase, bool clutIndexIsSimple) override; void ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer, GETextureFormat texFormat, FramebufferNotificationChannel channel) override; diff --git a/GPU/Vulkan/TextureCacheVulkan.cpp b/GPU/Vulkan/TextureCacheVulkan.cpp index 0453b648506f..29f06d09ee92 100644 --- a/GPU/Vulkan/TextureCacheVulkan.cpp +++ b/GPU/Vulkan/TextureCacheVulkan.cpp @@ -428,8 +428,8 @@ void TextureCacheVulkan::ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer gstate_c.depalFramebufferFormat = framebuffer->drawnFormat; const u32 bytesPerColor = clutFormat == GE_CMODE_32BIT_ABGR8888 ? sizeof(u32) : sizeof(u16); const u32 clutTotalColors = clutMaxBytes_ / bytesPerColor; - TexCacheEntry::TexStatus alphaStatus = CheckAlpha(clutBuf_, getClutDestFormatVulkan(clutFormat), clutTotalColors, clutTotalColors, 1); - gstate_c.SetTextureFullAlpha(alphaStatus == TexCacheEntry::STATUS_ALPHA_FULL); + CheckAlphaResult alphaStatus = CheckAlpha(clutBuf_, getClutDestFormatVulkan(clutFormat), clutTotalColors); + gstate_c.SetTextureFullAlpha(alphaStatus == CHECKALPHA_FULL); curSampler_ = samplerCache_.GetOrCreateSampler(samplerKey); if (framebufferManager_->BindFramebufferAsColorTexture(0, framebuffer, BINDFBCOLOR_MAY_COPY_WITH_UV | BINDFBCOLOR_APPLY_TEX_OFFSET)) { imageView_ = (VkImageView)draw_->GetNativeObject(Draw::NativeObject::BOUND_TEXTURE0_IMAGEVIEW); @@ -532,8 +532,8 @@ void TextureCacheVulkan::ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer const u32 bytesPerColor = clutFormat == GE_CMODE_32BIT_ABGR8888 ? sizeof(u32) : sizeof(u16); const u32 clutTotalColors = clutMaxBytes_ / bytesPerColor; - TexCacheEntry::TexStatus alphaStatus = CheckAlpha(clutBuf_, getClutDestFormatVulkan(clutFormat), clutTotalColors, clutTotalColors, 1); - gstate_c.SetTextureFullAlpha(alphaStatus == TexCacheEntry::STATUS_ALPHA_FULL); + CheckAlphaResult alphaStatus = CheckAlpha(clutBuf_, getClutDestFormatVulkan(clutFormat), clutTotalColors); + gstate_c.SetTextureFullAlpha(alphaStatus == CHECKALPHA_FULL); framebufferManager_->RebindFramebuffer("RebindFramebuffer - ApplyTextureFramebuffer"); draw_->BindFramebufferAsTexture(depalFBO, 0, Draw::FB_COLOR_BIT, 0); @@ -936,25 +936,18 @@ VkFormat TextureCacheVulkan::GetDestFormat(GETextureFormat format, GEPaletteForm } } -TexCacheEntry::TexStatus TextureCacheVulkan::CheckAlpha(const u32 *pixelData, VkFormat dstFmt, int stride, int w, int h) { - CheckAlphaResult res; +CheckAlphaResult TextureCacheVulkan::CheckAlpha(const u32 *pixelData, VkFormat dstFmt, int w) { switch (dstFmt) { case VULKAN_4444_FORMAT: - res = CheckAlphaRGBA4444Basic(pixelData, stride, w, h); - break; + return CheckAlpha16((const u16 *)pixelData, w, 0xF000); case VULKAN_1555_FORMAT: - res = CheckAlphaRGBA5551Basic(pixelData, stride, w, h); - break; + return CheckAlpha16((const u16 *)pixelData, w, 0x8000); case VULKAN_565_FORMAT: // Never has any alpha. - res = CHECKALPHA_FULL; - break; + return CHECKALPHA_FULL; default: - res = CheckAlphaRGBA8888Basic(pixelData, stride, w, h); - break; + return CheckAlpha32(pixelData, w, 0xFF000000); } - - return (TexCacheEntry::TexStatus)res; } void TextureCacheVulkan::LoadTextureLevel(TexCacheEntry &entry, uint8_t *writePtr, int rowPitch, int level, int scaleFactor, VkFormat dstFmt) { diff --git a/GPU/Vulkan/TextureCacheVulkan.h b/GPU/Vulkan/TextureCacheVulkan.h index 5c9b7a91fffb..599ad0bce7e8 100644 --- a/GPU/Vulkan/TextureCacheVulkan.h +++ b/GPU/Vulkan/TextureCacheVulkan.h @@ -107,7 +107,7 @@ class TextureCacheVulkan : public TextureCacheCommon { private: void LoadTextureLevel(TexCacheEntry &entry, uint8_t *writePtr, int rowPitch, int level, int scaleFactor, VkFormat dstFmt); VkFormat GetDestFormat(GETextureFormat format, GEPaletteFormat clutFormat) const; - static TexCacheEntry::TexStatus CheckAlpha(const u32 *pixelData, VkFormat dstFmt, int stride, int w, int h); + static CheckAlphaResult CheckAlpha(const u32 *pixelData, VkFormat dstFmt, int w); void UpdateCurrentClut(GEPaletteFormat clutFormat, u32 clutBase, bool clutIndexIsSimple) override; void ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer, GETextureFormat texFormat, FramebufferNotificationChannel channel) override;