diff --git a/.gitmodules b/.gitmodules index 7513c5e5e3e5..f90859d1d303 100644 --- a/.gitmodules +++ b/.gitmodules @@ -31,3 +31,6 @@ [submodule "ext/miniupnp"] path = ext/miniupnp url = https://github.com/hrydgard/miniupnp.git +[submodule "ext/wiiu"] + path = ext/wiiu + url = https://github.com/aliaspider/wiiu.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ac71497648d..bac153fcc0b0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,9 @@ # vim:noexpandtab: cmake_minimum_required(VERSION 3.6) +if (WIIU) + set(CMAKE_TOOLCHAIN_FILE ${CMAKE_SOURCE_DIR}/cmake/Toolchains/wiiu.cmake) +endif() + project(PPSSPP) #This is supposed to work but doesn't! @@ -52,6 +56,8 @@ if(CMAKE_SYSTEM_PROCESSOR) set(X86_DEVICE ON) elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^mips") set(MIPS_DEVICE ON) + elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^ppc" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "^powerpc") + set(PPC_DEVICE ON) else() message("Unknown CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") endif() @@ -102,6 +108,7 @@ option(ARM "Set to ON if targeting an ARM processor" ${ARM_DEVICE}) option(MIPS "Set to ON if targeting a MIPS processor" ${MIPS_DEVICE}) option(X86 "Set to ON if targeting an X86 processor" ${X86_DEVICE}) option(X86_64 "Set to ON if targeting an X86_64 processor" ${X86_64_DEVICE}) +option(PPC "Set to ON if targeting a PPC processor" ${PPC_DEVICE}) # :: Environments option(USING_EGL "Set to ON if target environment uses EGL" ${USING_EGL}) option(USING_FBDEV "Set to ON if target environment uses fbdev (eg. 
Pandora)" ${USING_FBDEV}) @@ -169,26 +176,27 @@ include_directories(ext/native) # Work around for some misfeature of the current glslang build system include_directories(ext/glslang) +if(NOT WIIU) + if(NOT OPENGL_LIBRARIES AND USING_GLES2) + set(OPENGL_LIBRARIES GLESv2 EGL) + endif() -if(NOT OPENGL_LIBRARIES AND USING_GLES2) - set(OPENGL_LIBRARIES GLESv2 EGL) -endif() - -if(NOT OPENGL_LIBRARIES) - find_package(OpenGL REQUIRED) -endif() + if(NOT OPENGL_LIBRARIES) + find_package(OpenGL REQUIRED) + endif() -if(USING_EGL) - if(NOT EGL_LIBRARIES) - set(EGL_LIBRARIES EGL) + if(USING_EGL) + if(NOT EGL_LIBRARIES) + set(EGL_LIBRARIES EGL) + endif() + set(OPENGL_LIBRARIES ${OPENGL_LIBRARIES} ${EGL_LIBRARIES}) endif() - set(OPENGL_LIBRARIES ${OPENGL_LIBRARIES} ${EGL_LIBRARIES}) -endif() -if(NOT LIBRETRO) - find_package(SDL2) + if(NOT LIBRETRO) + find_package(SDL2) + endif() + include(FindThreads) endif() -include(FindThreads) if(APPLE) find_library(COCOA_LIBRARY Cocoa) @@ -239,6 +247,9 @@ endif() if(ARM64) message("Generating for ARMv8, ${CMAKE_BUILD_TYPE}") endif() +if(PPC) + message("Generating for PowerPC, ${CMAKE_BUILD_TYPE}") +endif() # It looks like the flags for the selected build type are written to the cache after each run, which causes some of the operations # below to keep expanding them with the same flags over and over on every run, leading to a rebuild of the majority of the files. 
@@ -392,7 +403,13 @@ set(CommonMIPS ) source_group(MIPS FILES ${CommonMIPS}) -if(NOT (X86 OR ARM OR MIPS)) +set(CommonPPC + Common/FakeCPUDetect.cpp + Common/ppcEmitter.cpp + Common/ppcEmitter.h) +source_group(PPC FILES ${CommonPPC}) + +if(NOT (X86 OR ARM OR MIPS OR PPC)) set(CommonFake Common/FakeCPUDetect.cpp Common/FakeEmitter.h @@ -406,6 +423,7 @@ set(CommonWindows ) source_group(Windows FILES ${CommonWindows}) + set(CommonVulkan ${CommonExtra} Common/Vulkan/VulkanDebug.cpp Common/Vulkan/VulkanDebug.h @@ -424,6 +442,7 @@ add_library(Common STATIC ${CommonARM} ${CommonARM64} ${CommonMIPS} + ${CommonPPC} ${CommonFake} ${CommonWindows} ${CommonVulkan} @@ -460,6 +479,7 @@ add_library(Common STATIC Common/MemArenaDarwin.cpp Common/MemArenaPosix.cpp Common/MemArenaWin32.cpp + Common/MemArenaWiiU.cpp Common/MemArena.h Common/MemoryUtil.cpp Common/MemoryUtil.h @@ -527,6 +547,8 @@ if(USE_FFMPEG) set(PLATFORM_ARCH "ios/universal") elseif(MACOSX) set(PLATFORM_ARCH "macosx/x86_64") + elseif(WIIU) + set(PLATFORM_ARCH "wiiu") elseif(LINUX) if(ARMV7) set(PLATFORM_ARCH "linux/armv7") @@ -799,6 +821,16 @@ elseif(IOS) set_source_files_properties(ios/CameraHelper.mm PROPERTIES COMPILE_FLAGS -fobjc-arc) set_source_files_properties(ios/LocationHelper.mm PROPERTIES COMPILE_FLAGS -fobjc-arc) + set(TargetBin PPSSPP) +elseif(WIIU) + add_definitions(-D_GNU_SOURCE) + set(nativeExtra ${nativeExtra} + WiiU/WiiUHost.h + WiiU/WiiUHost.cpp + WiiU/WiiUMain.cpp + WiiU/GX2GraphicsContext.h + WiiU/GX2GraphicsContext.cpp) + set(nativeExtraLibs ${nativeExtraLibs} wiiu fat iosuhax) set(TargetBin PPSSPP) elseif(USING_QT_UI) set(CMAKE_AUTOMOC ON) @@ -907,14 +939,20 @@ if(ANDROID) endif() endif() -set(THIN3D_PLATFORMS ext/native/thin3d/thin3d_gl.cpp - ext/native/thin3d/GLRenderManager.cpp - ext/native/thin3d/GLRenderManager.h - ext/native/thin3d/GLQueueRunner.cpp - ext/native/thin3d/GLQueueRunner.h - ext/native/thin3d/DataFormatGL.cpp - ext/native/thin3d/DataFormatGL.h -) +if(WIIU) + 
set(THIN3D_PLATFORMS ext/native/thin3d/thin3d_gx2.cpp + ext/native/thin3d/GX2Shaders.c + ) +else() + set(THIN3D_PLATFORMS ext/native/thin3d/thin3d_gl.cpp + ext/native/thin3d/GLRenderManager.cpp + ext/native/thin3d/GLRenderManager.h + ext/native/thin3d/GLQueueRunner.cpp + ext/native/thin3d/GLQueueRunner.h + ext/native/thin3d/DataFormatGL.cpp + ext/native/thin3d/DataFormatGL.h + ) +endif() set(THIN3D_PLATFORMS ${THIN3D_PLATFORMS} ext/native/thin3d/thin3d_vulkan.cpp @@ -937,6 +975,35 @@ if(WIN32) ) endif() +set(nativeExtra ${nativeExtra} +ext/native/gfx_es2/draw_buffer.cpp +ext/native/gfx_es2/draw_buffer.h +ext/native/gfx_es2/draw_text.cpp +ext/native/gfx_es2/draw_text.h +ext/native/gfx_es2/gpu_features.cpp +ext/native/gfx_es2/gpu_features.h) + +if(NOT WIIU) + set(nativeExtra ${nativeExtra} + ext/native/gfx/gl_common.h + ext/native/gfx/gl_debug_log.cpp + ext/native/gfx/gl_debug_log.h + ext/native/gfx/d3d9_shader.cpp + ext/native/gfx/d3d9_shader.h + ext/native/gfx/d3d9_state.cpp + ext/native/gfx/d3d9_state.h + ext/native/gfx_es2/draw_text_win.cpp + ext/native/gfx_es2/draw_text_win.h + ext/native/gfx_es2/draw_text_qt.cpp + ext/native/gfx_es2/draw_text_qt.h + ext/native/gfx_es2/draw_text_android.cpp + ext/native/gfx_es2/draw_text_android.h + ext/native/gfx_es2/glsl_program.cpp + ext/native/gfx_es2/glsl_program.h + ext/native/gfx_es2/gl3stub.c + ext/native/gfx_es2/gl3stub.h) +endif() + add_library(native STATIC ${nativeExtra} ext/native/base/backtrace.cpp @@ -970,31 +1037,8 @@ add_library(native STATIC ext/native/file/vfs.h ext/native/file/zip_read.cpp ext/native/file/zip_read.h - ext/native/gfx/gl_common.h - ext/native/gfx/gl_debug_log.cpp - ext/native/gfx/gl_debug_log.h ext/native/gfx/texture_atlas.cpp ext/native/gfx/texture_atlas.h - ext/native/gfx/d3d9_shader.cpp - ext/native/gfx/d3d9_shader.h - ext/native/gfx/d3d9_state.cpp - ext/native/gfx/d3d9_state.h - ext/native/gfx_es2/draw_buffer.cpp - ext/native/gfx_es2/draw_buffer.h - ext/native/gfx_es2/draw_text.cpp - 
ext/native/gfx_es2/draw_text.h - ext/native/gfx_es2/draw_text_win.cpp - ext/native/gfx_es2/draw_text_win.h - ext/native/gfx_es2/draw_text_qt.cpp - ext/native/gfx_es2/draw_text_qt.h - ext/native/gfx_es2/draw_text_android.cpp - ext/native/gfx_es2/draw_text_android.h - ext/native/gfx_es2/gpu_features.cpp - ext/native/gfx_es2/gpu_features.h - ext/native/gfx_es2/glsl_program.cpp - ext/native/gfx_es2/glsl_program.h - ext/native/gfx_es2/gl3stub.c - ext/native/gfx_es2/gl3stub.h ext/native/i18n/i18n.cpp ext/native/i18n/i18n.h ext/native/image/png_load.cpp @@ -1233,9 +1277,28 @@ list(APPEND CoreExtra GPU/Software/SamplerX86.cpp ) +list(APPEND CoreExtra + Core/MIPS/PPC/PpcAsm.cpp + Core/MIPS/PPC/PpcCompAlu.cpp + Core/MIPS/PPC/PpcCompBranch.cpp + Core/MIPS/PPC/PpcCompFpu.cpp + Core/MIPS/PPC/PpcCompLoadStore.cpp + Core/MIPS/PPC/PpcCompReplace.cpp + Core/MIPS/PPC/PpcCompVFPU.cpp + Core/MIPS/PPC/PpcJit.cpp + Core/MIPS/PPC/PpcJit.h + Core/MIPS/PPC/PpcRegCache.cpp + Core/MIPS/PPC/PpcRegCache.h + Core/MIPS/PPC/PpcRegCacheFPU.cpp + Core/MIPS/PPC/PpcRegCacheFPU.h + Core/MIPS/PPC/PpcRegCacheVPU.cpp + Core/MIPS/PPC/PpcRegCacheVPU.h) + list(APPEND CoreExtra Core/MIPS/MIPS/MipsJit.cpp Core/MIPS/MIPS/MipsJit.h + Core/MIPS/fake/FakeJit.cpp + Core/MIPS/fake/FakeJit.h GPU/Common/VertexDecoderFake.cpp ) @@ -1355,8 +1418,43 @@ set(GPU_D3D11 GPU/D3D11/VertexShaderGeneratorD3D11.h ) -# We build Vulkan even on Apple to avoid annoying build differences. 
-set(GPU_IMPLS ${GPU_GLES} ${GPU_VULKAN}) +set(GPU_GX2 + GPU/GX2/DepalettizeShaderGX2.cpp + GPU/GX2/DepalettizeShaderGX2.h + GPU/GX2/DrawEngineGX2.cpp + GPU/GX2/DrawEngineGX2.h + GPU/GX2/FragmentShaderGeneratorGX2.cpp + GPU/GX2/FragmentShaderGeneratorGX2.h + GPU/GX2/FramebufferManagerGX2.cpp + GPU/GX2/FramebufferManagerGX2.h + GPU/GX2/GPU_GX2.cpp + GPU/GX2/GPU_GX2.h + GPU/GX2/GX2Shaders.c + GPU/GX2/GX2Shaders.h + GPU/GX2/GX2Util.cpp + GPU/GX2/GX2Util.h + GPU/GX2/ShaderManagerGX2.cpp + GPU/GX2/ShaderManagerGX2.h + GPU/GX2/StateMappingGX2.cpp + GPU/GX2/StateMappingGX2.h + GPU/GX2/StencilBufferGX2.cpp + GPU/GX2/TextureCacheGX2.cpp + GPU/GX2/TextureCacheGX2.h + GPU/GX2/TextureScalerGX2.cpp + GPU/GX2/TextureScalerGX2.h + GPU/GX2/VertexShaderGeneratorGX2.cpp + GPU/GX2/VertexShaderGeneratorGX2.h +) + +# We build Vulkan even on Apple and WiiU to avoid annoying build differences. +set(GPU_IMPLS ${GPU_VULKAN}) + +if(WIIU) + list(APPEND GPU_IMPLS ${GPU_GX2}) +else() + list(APPEND GPU_IMPLS ${GPU_GLES}) +endif() + if(WIN32) list(APPEND GPU_IMPLS ${GPU_D3D9} ${GPU_D3D11}) endif() @@ -1367,8 +1465,6 @@ endif() set(GPU_SOURCES ${GPU_IMPLS} ${GPU_NEON} - GPU/Common/DepalettizeShaderCommon.cpp - GPU/Common/DepalettizeShaderCommon.h GPU/Common/FramebufferManagerCommon.cpp GPU/Common/FramebufferManagerCommon.h GPU/Common/GPUDebugInterface.cpp @@ -1379,14 +1475,6 @@ set(GPU_SOURCES GPU/Common/DrawEngineCommon.h GPU/Common/PresentationCommon.cpp GPU/Common/PresentationCommon.h - GPU/Common/ShaderId.cpp - GPU/Common/ShaderId.h - GPU/Common/ShaderUniforms.cpp - GPU/Common/ShaderUniforms.h - GPU/Common/ShaderCommon.cpp - GPU/Common/ShaderCommon.h - GPU/Common/ShaderTranslation.cpp - GPU/Common/ShaderTranslation.h GPU/Common/SplineCommon.cpp GPU/Common/SplineCommon.h GPU/Common/StencilCommon.cpp @@ -1395,6 +1483,16 @@ set(GPU_SOURCES GPU/Common/SoftwareTransformCommon.h GPU/Common/VertexDecoderCommon.cpp GPU/Common/VertexDecoderCommon.h + GPU/Common/DepalettizeShaderCommon.cpp + 
GPU/Common/DepalettizeShaderCommon.h + GPU/Common/ShaderId.cpp + GPU/Common/ShaderId.h + GPU/Common/ShaderCommon.cpp + GPU/Common/ShaderCommon.h + GPU/Common/ShaderUniforms.cpp + GPU/Common/ShaderUniforms.h + GPU/Common/ShaderTranslation.cpp + GPU/Common/ShaderTranslation.h GPU/Common/TransformCommon.cpp GPU/Common/TransformCommon.h GPU/Common/IndexGenerator.cpp @@ -1819,7 +1917,11 @@ set(CoreExtraLibs ${CoreExtraLibs} armips) #endif() -set(GlslangLibs glslang OGLCompiler OSDependent SPIRV SPVRemapper spirv-cross-glsl) +if(WIIU) + set(CMAKE_DL_LIBS "") +else() + set(GlslangLibs glslang OGLCompiler OSDependent SPIRV SPVRemapper spirv-cross-glsl) +endif() if(WIN32) set(GlslangLibs ${GlslangLibs} spirv-cross-hlsl) @@ -1838,16 +1940,19 @@ if(FFmpeg_FOUND) FFmpeg::swscale ${ZLIB_LIBRARY} ) + if(WIIU) + target_link_libraries(${CoreLibName} pthread) + endif() endif() # Discord integration -if(USE_DISCORD AND NOT IOS AND NOT LIBRETRO) +if(USE_DISCORD AND NOT IOS AND NOT WIIU AND NOT LIBRETRO) add_definitions(-DUSE_DISCORD=1) target_link_libraries(${CoreLibName} discord-rpc) endif() # miniUPnPc integration (MiniUPnPc supposed to works on any POSIX system, not sure if some of these are redundant/not needed tho) -if(USE_MINIUPNPC) +if(USE_MINIUPNPC AND NOT WIIU) set (MINIUPNPC_VERSION 2.1) # used by miniupnpcstrings.h.cmake set (MINIUPNPC_API_VERSION 17) option(UPNPC_BUILD_STATIC "Build static library" TRUE) @@ -2194,3 +2299,7 @@ if(IOS) XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY "-" ) endif() + +if(WIIU) + add_rpx_target(${TargetBin}) +endif() diff --git a/Common/ColorConv.cpp b/Common/ColorConv.cpp index 0146967e99c3..e83b5b4fc6ef 100644 --- a/Common/ColorConv.cpp +++ b/Common/ColorConv.cpp @@ -95,7 +95,7 @@ void convert5551_gl(u16* data, u32* out, int width, int l, int u) { } // convert 4444 image to 8888, parallelizable -void convert4444_dx9(u16* data, u32* out, int width, int l, int u) { +void convert4444_dx9(u16_le* data, u32* out, int width, int l, int u) { for (int y = l; y < 
u; ++y) { for (int x = 0; x < width; ++x) { u32 val = data[y*width + x]; @@ -109,7 +109,7 @@ void convert4444_dx9(u16* data, u32* out, int width, int l, int u) { } // convert 565 image to 8888, parallelizable -void convert565_dx9(u16* data, u32* out, int width, int l, int u) { +void convert565_dx9(u16_le* data, u32* out, int width, int l, int u) { for (int y = l; y < u; ++y) { for (int x = 0; x < width; ++x) { u32 val = data[y*width + x]; @@ -122,7 +122,7 @@ void convert565_dx9(u16* data, u32* out, int width, int l, int u) { } // convert 5551 image to 8888, parallelizable -void convert5551_dx9(u16* data, u32* out, int width, int l, int u) { +void convert5551_dx9(u16_le* data, u32* out, int width, int l, int u) { for (int y = l; y < u; ++y) { for (int x = 0; x < width; ++x) { u32 val = data[y*width + x]; @@ -137,7 +137,7 @@ void convert5551_dx9(u16* data, u32* out, int width, int l, int u) { -void ConvertBGRA8888ToRGBA8888(u32 *dst, const u32 *src, u32 numPixels) { +void ConvertBGRA8888ToRGBA8888(u32_le *dst, const u32_le *src, u32 numPixels) { #ifdef _M_SSE const __m128i maskGA = _mm_set1_epi32(0xFF00FF00); @@ -170,7 +170,7 @@ void ConvertBGRA8888ToRGBA8888(u32 *dst, const u32 *src, u32 numPixels) { } } -void ConvertRGBA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) { +void ConvertRGBA8888ToRGBA5551(u16_le *dst, const u32_le *src, u32 numPixels) { #if _M_SSE >= 0x401 const __m128i maskAG = _mm_set1_epi32(0x8000F800); const __m128i maskRB = _mm_set1_epi32(0x00F800F8); @@ -212,7 +212,7 @@ void ConvertRGBA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) { } } -void ConvertBGRA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) { +void ConvertBGRA8888ToRGBA5551(u16_le *dst, const u32_le *src, u32 numPixels) { #if _M_SSE >= 0x401 const __m128i maskAG = _mm_set1_epi32(0x8000F800); const __m128i maskRB = _mm_set1_epi32(0x00F800F8); @@ -254,31 +254,31 @@ void ConvertBGRA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) { } } -void 
ConvertBGRA8888ToRGB565(u16 *dst, const u32 *src, u32 numPixels) { +void ConvertBGRA8888ToRGB565(u16_le *dst, const u32_le *src, u32 numPixels) { for (u32 i = 0; i < numPixels; i++) { dst[i] = BGRA8888toRGB565(src[i]); } } -void ConvertBGRA8888ToRGBA4444(u16 *dst, const u32 *src, u32 numPixels) { +void ConvertBGRA8888ToRGBA4444(u16_le *dst, const u32_le *src, u32 numPixels) { for (u32 i = 0; i < numPixels; i++) { dst[i] = BGRA8888toRGBA4444(src[i]); } } -void ConvertRGBA8888ToRGB565(u16 *dst, const u32 *src, u32 numPixels) { +void ConvertRGBA8888ToRGB565(u16_le *dst, const u32_le *src, u32 numPixels) { for (u32 x = 0; x < numPixels; ++x) { dst[x] = RGBA8888toRGB565(src[x]); } } -void ConvertRGBA8888ToRGBA4444(u16 *dst, const u32 *src, u32 numPixels) { +void ConvertRGBA8888ToRGBA4444(u16_le *dst, const u32_le *src, u32 numPixels) { for (u32 x = 0; x < numPixels; ++x) { dst[x] = RGBA8888toRGBA4444(src[x]); } } -void ConvertRGB565ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) { +void ConvertRGB565ToRGBA8888(u32_le *dst32, const u16_le *src, u32 numPixels) { #ifdef _M_SSE const __m128i mask5 = _mm_set1_epi16(0x001f); const __m128i mask6 = _mm_set1_epi16(0x003f); @@ -322,17 +322,16 @@ void ConvertRGB565ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) { u32 i = 0; #endif - u8 *dst = (u8 *)dst32; for (u32 x = i; x < numPixels; x++) { u16 col = src[x]; - dst[x * 4] = Convert5To8((col) & 0x1f); - dst[x * 4 + 1] = Convert6To8((col >> 5) & 0x3f); - dst[x * 4 + 2] = Convert5To8((col >> 11) & 0x1f); - dst[x * 4 + 3] = 255; + dst32[x] = Convert5To8((col) & 0x1f); + dst32[x] |= Convert6To8((col >> 5) & 0x3f) << 8; + dst32[x] |= Convert5To8((col >> 11) & 0x1f) << 16; + dst32[x] |= 255 << 24; } } -void ConvertRGBA5551ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) { +void ConvertRGBA5551ToRGBA8888(u32_le *dst32, const u16_le *src, u32 numPixels) { #ifdef _M_SSE const __m128i mask5 = _mm_set1_epi16(0x001f); const __m128i mask8 = _mm_set1_epi16(0x00ff); @@ -376,17 
+375,16 @@ void ConvertRGBA5551ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) { u32 i = 0; #endif - u8 *dst = (u8 *)dst32; for (u32 x = i; x < numPixels; x++) { u16 col = src[x]; - dst[x * 4] = Convert5To8((col) & 0x1f); - dst[x * 4 + 1] = Convert5To8((col >> 5) & 0x1f); - dst[x * 4 + 2] = Convert5To8((col >> 10) & 0x1f); - dst[x * 4 + 3] = (col >> 15) ? 255 : 0; + dst32[x] = Convert5To8((col) & 0x1f); + dst32[x] |= Convert5To8((col >> 5) & 0x1f) << 8; + dst32[x] |= Convert5To8((col >> 10) & 0x1f) << 16; + dst32[x] |= (col >> 15) ? 255 << 24 : 0; } } -void ConvertRGBA4444ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) { +void ConvertRGBA4444ToRGBA8888(u32_le *dst32, const u16_le *src, u32 numPixels) { #ifdef _M_SSE const __m128i mask4 = _mm_set1_epi16(0x000f); @@ -425,51 +423,46 @@ void ConvertRGBA4444ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) { u32 i = 0; #endif - u8 *dst = (u8 *)dst32; for (u32 x = i; x < numPixels; x++) { u16 col = src[x]; - dst[x * 4] = Convert4To8(col & 0xf); - dst[x * 4 + 1] = Convert4To8((col >> 4) & 0xf); - dst[x * 4 + 2] = Convert4To8((col >> 8) & 0xf); - dst[x * 4 + 3] = Convert4To8(col >> 12); + dst32[x] = Convert4To8(col & 0xf); + dst32[x] |= Convert4To8((col >> 4) & 0xf) << 8; + dst32[x] |= Convert4To8((col >> 8) & 0xf) << 16; + dst32[x] |= Convert4To8(col >> 12) << 24; } } -void ConvertBGR565ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) { - u8 *dst = (u8 *)dst32; +void ConvertBGR565ToRGBA8888(u32_le *dst32, const u16_le *src, u32 numPixels) { for (u32 x = 0; x < numPixels; x++) { u16 col = src[x]; - dst[x * 4] = Convert5To8((col >> 11) & 0x1f); - dst[x * 4 + 1] = Convert6To8((col >> 5) & 0x3f); - dst[x * 4 + 2] = Convert5To8((col) & 0x1f); - dst[x * 4 + 3] = 255; + dst32[x] = Convert5To8((col >> 11) & 0x1f); + dst32[x] |= Convert6To8((col >> 5) & 0x3f) << 8; + dst32[x] |= Convert5To8((col) & 0x1f) << 16; + dst32[x] |= 255 << 24; } } -void ConvertABGR1555ToRGBA8888(u32 *dst32, const u16 *src, u32 
numPixels) { - u8 *dst = (u8 *)dst32; +void ConvertABGR1555ToRGBA8888(u32_le *dst32, const u16_le *src, u32 numPixels) { for (u32 x = 0; x < numPixels; x++) { u16 col = src[x]; - dst[x * 4] = Convert5To8((col >> 11) & 0x1f); - dst[x * 4 + 1] = Convert5To8((col >> 6) & 0x1f); - dst[x * 4 + 2] = Convert5To8((col >> 1) & 0x1f); - dst[x * 4 + 3] = (col & 1) ? 255 : 0; + dst32[x] = Convert5To8((col >> 11) & 0x1f); + dst32[x] |= Convert5To8((col >> 6) & 0x1f) << 8; + dst32[x] |= Convert5To8((col >> 1) & 0x1f) << 16; + dst32[x] |= (col & 1) ? 255 << 24 : 0; } } -void ConvertABGR4444ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) { - u8 *dst = (u8 *)dst32; +void ConvertABGR4444ToRGBA8888(u32_le *dst32, const u16_le *src, u32 numPixels) { for (u32 x = 0; x < numPixels; x++) { u16 col = src[x]; - dst[x * 4] = Convert4To8(col >> 12); - dst[x * 4 + 1] = Convert4To8((col >> 8) & 0xf); - dst[x * 4 + 2] = Convert4To8((col >> 4) & 0xf); - dst[x * 4 + 3] = Convert4To8(col & 0xf); + dst32[x] = Convert4To8(col >> 12); + dst32[x] |= Convert4To8((col >> 8) & 0xf) << 8; + dst32[x] |= Convert4To8((col >> 4) & 0xf) << 16; + dst32[x] |= Convert4To8(col & 0xf) << 24; } } -void ConvertRGBA4444ToBGRA8888(u32 *dst32, const u16 *src, u32 numPixels) { - u8 *dst = (u8 *)dst32; +void ConvertRGBA4444ToBGRA8888(u32_le *dst, const u16_le *src, u32 numPixels) { for (u32 x = 0; x < numPixels; x++) { u16 c = src[x]; u32 r = Convert4To8(c & 0x000f); @@ -481,7 +474,7 @@ void ConvertRGBA4444ToBGRA8888(u32 *dst32, const u16 *src, u32 numPixels) { } } -void ConvertRGBA5551ToBGRA8888(u32 *dst, const u16 *src, u32 numPixels) { +void ConvertRGBA5551ToBGRA8888(u32_le *dst, const u16_le *src, u32 numPixels) { for (u32 x = 0; x < numPixels; x++) { u16 c = src[x]; u32 r = Convert5To8(c & 0x001f); @@ -494,7 +487,7 @@ void ConvertRGBA5551ToBGRA8888(u32 *dst, const u16 *src, u32 numPixels) { } } -void ConvertRGB565ToBGRA8888(u32 *dst, const u16 *src, u32 numPixels) { +void ConvertRGB565ToBGRA8888(u32_le *dst, 
const u16_le *src, u32 numPixels) { for (u32 x = 0; x < numPixels; x++) { u16 c = src[x]; u32 r = Convert5To8(c & 0x001f); @@ -505,7 +498,7 @@ void ConvertRGB565ToBGRA8888(u32 *dst, const u16 *src, u32 numPixels) { } } -void ConvertRGBA4444ToABGR4444Basic(u16 *dst, const u16 *src, u32 numPixels) { +void ConvertRGBA4444ToABGR4444Basic(u16_le *dst, const u16_le *src, u32 numPixels) { #ifdef _M_SSE const __m128i mask0040 = _mm_set1_epi16(0x00F0); @@ -529,8 +522,8 @@ void ConvertRGBA4444ToABGR4444Basic(u16 *dst, const u16 *src, u32 numPixels) { u32 i = 0; #endif - const u32 *src32 = (const u32 *)src; - u32 *dst32 = (u32 *)dst; + const u32_le *src32 = (const u32_le *)src; + u32_le *dst32 = (u32_le *)dst; for (; i < numPixels / 2; i++) { const u32 c = src32[i]; dst32[i] = ((c >> 12) & 0x000F000F) | @@ -549,7 +542,7 @@ void ConvertRGBA4444ToABGR4444Basic(u16 *dst, const u16 *src, u32 numPixels) { } } -void ConvertRGBA5551ToABGR1555Basic(u16 *dst, const u16 *src, u32 numPixels) { +void ConvertRGBA5551ToABGR1555Basic(u16_le *dst, const u16_le *src, u32 numPixels) { #ifdef _M_SSE const __m128i maskB = _mm_set1_epi16(0x003E); const __m128i maskG = _mm_set1_epi16(0x07C0); @@ -574,8 +567,8 @@ void ConvertRGBA5551ToABGR1555Basic(u16 *dst, const u16 *src, u32 numPixels) { u32 i = 0; #endif - const u32 *src32 = (const u32 *)src; - u32 *dst32 = (u32 *)dst; + const u32_le *src32 = (const u32_le *)src; + u32_le *dst32 = (u32_le *)dst; for (; i < numPixels / 2; i++) { const u32 c = src32[i]; dst32[i] = ((c >> 15) & 0x00010001) | @@ -594,7 +587,7 @@ void ConvertRGBA5551ToABGR1555Basic(u16 *dst, const u16 *src, u32 numPixels) { } } -void ConvertRGB565ToBGR565Basic(u16 *dst, const u16 *src, u32 numPixels) { +void ConvertRGB565ToBGR565Basic(u16_le *dst, const u16_le *src, u32 numPixels) { #ifdef _M_SSE const __m128i maskG = _mm_set1_epi16(0x07E0); @@ -617,8 +610,8 @@ void ConvertRGB565ToBGR565Basic(u16 *dst, const u16 *src, u32 numPixels) { u32 i = 0; #endif - const u32 *src32 = (const 
u32 *)src; - u32 *dst32 = (u32 *)dst; + const u32_le *src32 = (const u32_le *)src; + u32_le *dst32 = (u32_le *)dst; for (; i < numPixels / 2; i++) { const u32 c = src32[i]; dst32[i] = ((c >> 11) & 0x001F001F) | diff --git a/Common/ColorConv.h b/Common/ColorConv.h index e2ae708bf7c5..b24b1c495fe4 100644 --- a/Common/ColorConv.h +++ b/Common/ColorConv.h @@ -20,6 +20,7 @@ #include "ppsspp_config.h" #include "CommonTypes.h" #include "ColorConvNEON.h" +#include "Swap.h" void SetupColorConv(); @@ -101,46 +102,46 @@ inline u16 RGBA8888ToRGBA4444(u32 value) { void convert4444_gl(u16* data, u32* out, int width, int l, int u); void convert565_gl(u16* data, u32* out, int width, int l, int u); void convert5551_gl(u16* data, u32* out, int width, int l, int u); -void convert4444_dx9(u16* data, u32* out, int width, int l, int u); -void convert565_dx9(u16* data, u32* out, int width, int l, int u); -void convert5551_dx9(u16* data, u32* out, int width, int l, int u); +void convert4444_dx9(u16_le* data, u32* out, int width, int l, int u); +void convert565_dx9(u16_le* data, u32* out, int width, int l, int u); +void convert5551_dx9(u16_le* data, u32* out, int width, int l, int u); // "Complete" set of color conversion functions between the usual formats. // TODO: Need to revisit the naming convention of these. Seems totally backwards // now that we've standardized on Draw::DataFormat. 
-typedef void (*Convert16bppTo16bppFunc)(u16 *dst, const u16 *src, u32 numPixels); -typedef void (*Convert16bppTo32bppFunc)(u32 *dst, const u16 *src, u32 numPixels); -typedef void (*Convert32bppTo16bppFunc)(u16 *dst, const u32 *src, u32 numPixels); -typedef void (*Convert32bppTo32bppFunc)(u32 *dst, const u32 *src, u32 numPixels); +typedef void (*Convert16bppTo16bppFunc)(u16_le *dst, const u16_le *src, u32 numPixels); +typedef void (*Convert16bppTo32bppFunc)(u32_le *dst, const u16_le *src, u32 numPixels); +typedef void (*Convert32bppTo16bppFunc)(u16_le *dst, const u32_le *src, u32 numPixels); +typedef void (*Convert32bppTo32bppFunc)(u32_le *dst, const u32_le *src, u32 numPixels); -void ConvertBGRA8888ToRGBA8888(u32 *dst, const u32 *src, u32 numPixels); +void ConvertBGRA8888ToRGBA8888(u32_le *dst, const u32_le *src, u32 numPixels); #define ConvertRGBA8888ToBGRA8888 ConvertBGRA8888ToRGBA8888 -void ConvertRGBA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels); -void ConvertRGBA8888ToRGB565(u16 *dst, const u32 *src, u32 numPixels); -void ConvertRGBA8888ToRGBA4444(u16 *dst, const u32 *src, u32 numPixels); +void ConvertRGBA8888ToRGBA5551(u16_le *dst, const u32_le *src, u32 numPixels); +void ConvertRGBA8888ToRGB565(u16_le *dst, const u32_le *src, u32 numPixels); +void ConvertRGBA8888ToRGBA4444(u16_le *dst, const u32_le *src, u32 numPixels); -void ConvertBGRA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels); -void ConvertBGRA8888ToRGB565(u16 *dst, const u32 *src, u32 numPixels); -void ConvertBGRA8888ToRGBA4444(u16 *dst, const u32 *src, u32 numPixels); +void ConvertBGRA8888ToRGBA5551(u16_le *dst, const u32_le *src, u32 numPixels); +void ConvertBGRA8888ToRGB565(u16_le *dst, const u32_le *src, u32 numPixels); +void ConvertBGRA8888ToRGBA4444(u16_le *dst, const u32_le *src, u32 numPixels); -void ConvertRGB565ToRGBA8888(u32 *dst, const u16 *src, u32 numPixels); -void ConvertRGBA5551ToRGBA8888(u32 *dst, const u16 *src, u32 numPixels); -void ConvertRGBA4444ToRGBA8888(u32 
*dst, const u16 *src, u32 numPixels); +void ConvertRGB565ToRGBA8888(u32_le *dst, const u16_le *src, u32 numPixels); +void ConvertRGBA5551ToRGBA8888(u32_le *dst, const u16_le *src, u32 numPixels); +void ConvertRGBA4444ToRGBA8888(u32_le *dst, const u16_le *src, u32 numPixels); -void ConvertBGR565ToRGBA8888(u32 *dst, const u16 *src, u32 numPixels); -void ConvertABGR1555ToRGBA8888(u32 *dst, const u16 *src, u32 numPixels); -void ConvertABGR4444ToRGBA8888(u32 *dst, const u16 *src, u32 numPixels); +void ConvertBGR565ToRGBA8888(u32_le *dst, const u16_le *src, u32 numPixels); +void ConvertABGR1555ToRGBA8888(u32_le *dst, const u16_le *src, u32 numPixels); +void ConvertABGR4444ToRGBA8888(u32_le *dst, const u16_le *src, u32 numPixels); -void ConvertRGBA4444ToBGRA8888(u32 *dst, const u16 *src, u32 numPixels); -void ConvertRGBA5551ToBGRA8888(u32 *dst, const u16 *src, u32 numPixels); -void ConvertRGB565ToBGRA8888(u32 *dst, const u16 *src, u32 numPixels); +void ConvertRGBA4444ToBGRA8888(u32_le *dst, const u16_le *src, u32 numPixels); +void ConvertRGBA5551ToBGRA8888(u32_le *dst, const u16_le *src, u32 numPixels); +void ConvertRGB565ToBGRA8888(u32_le *dst, const u16_le *src, u32 numPixels); -void ConvertRGBA4444ToABGR4444Basic(u16 *dst, const u16 *src, u32 numPixels); -void ConvertRGBA5551ToABGR1555Basic(u16 *dst, const u16 *src, u32 numPixels); -void ConvertRGB565ToBGR565Basic(u16 *dst, const u16 *src, u32 numPixels); +void ConvertRGBA4444ToABGR4444Basic(u16_le *dst, const u16_le *src, u32 numPixels); +void ConvertRGBA5551ToABGR1555Basic(u16_le *dst, const u16_le *src, u32 numPixels); +void ConvertRGB565ToBGR565Basic(u16_le *dst, const u16_le *src, u32 numPixels); #if PPSSPP_ARCH(ARM64) #define ConvertRGBA4444ToABGR4444 ConvertRGBA4444ToABGR4444NEON diff --git a/Common/CommonFuncs.h b/Common/CommonFuncs.h index 4e5adc8beb66..80d9887d4092 100644 --- a/Common/CommonFuncs.h +++ b/Common/CommonFuncs.h @@ -30,6 +30,10 @@ #if defined(_M_IX86) || defined(_M_X86) #define Crash() {asm ("int 
$3");} +#elif defined(__wiiu__) +#include +#include +#define Crash() do{printf("Crash@%s:%4d %s().\n", __FILE__, __LINE__, __FUNCTION__);fflush(stdout);*(int*)0=0;}while(0) #else #include #define Crash() {kill(getpid(), SIGINT);} diff --git a/Common/FakeEmitter.h b/Common/FakeEmitter.h index 6b827655c84b..77d634bc9bfd 100644 --- a/Common/FakeEmitter.h +++ b/Common/FakeEmitter.h @@ -24,6 +24,8 @@ #include #include "Common.h" +#include "CodeBlock.h" + // VCVT flags #define TO_FLOAT 0 @@ -167,7 +169,7 @@ class Operand2 Operand2(FakeReg base, ShiftType type, FakeReg shift) // RSR { Type = TYPE_RSR; - _assert_msg_(JIT, type != ST_RRX, "Invalid Operand2: RRX does not take a register shift amount"); + _assert_msg_(type != ST_RRX, "Invalid Operand2: RRX does not take a register shift amount"); IndexOrShift = shift; Shift = type; Value = base; @@ -179,29 +181,29 @@ class Operand2 switch (type) { case ST_LSL: - _assert_msg_(JIT, shift < 32, "Invalid Operand2: LSL %u", shift); + _assert_msg_(shift < 32, "Invalid Operand2: LSL %u", shift); break; case ST_LSR: - _assert_msg_(JIT, shift <= 32, "Invalid Operand2: LSR %u", shift); + _assert_msg_(shift <= 32, "Invalid Operand2: LSR %u", shift); if (!shift) type = ST_LSL; if (shift == 32) shift = 0; break; case ST_ASR: - _assert_msg_(JIT, shift < 32, "Invalid Operand2: ASR %u", shift); + _assert_msg_(shift < 32, "Invalid Operand2: ASR %u", shift); if (!shift) type = ST_LSL; if (shift == 32) shift = 0; break; case ST_ROR: - _assert_msg_(JIT, shift < 32, "Invalid Operand2: ROR %u", shift); + _assert_msg_(shift < 32, "Invalid Operand2: ROR %u", shift); if (!shift) type = ST_LSL; break; case ST_RRX: - _assert_msg_(JIT, shift == 0, "Invalid Operand2: RRX does not take an immediate shift amount"); + _assert_msg_(shift == 0, "Invalid Operand2: RRX does not take an immediate shift amount"); type = ST_ROR; break; } @@ -223,45 +225,45 @@ class Operand2 case TYPE_RSR: return RSR(); default: - _assert_msg_(JIT, false, "GetData with Invalid 
Type"); + _assert_msg_(false, "GetData with Invalid Type"); return 0; } } u32 IMMSR() // IMM shifted register { - _assert_msg_(JIT, Type == TYPE_IMMSREG, "IMMSR must be imm shifted register"); + _assert_msg_(Type == TYPE_IMMSREG, "IMMSR must be imm shifted register"); return ((IndexOrShift & 0x1f) << 7 | (Shift << 5) | Value); } u32 RSR() // Register shifted register { - _assert_msg_(JIT, Type == TYPE_RSR, "RSR must be RSR Of Course"); + _assert_msg_(Type == TYPE_RSR, "RSR must be RSR Of Course"); return (IndexOrShift << 8) | (Shift << 5) | 0x10 | Value; } u32 Rm() { - _assert_msg_(JIT, Type == TYPE_REG, "Rm must be with Reg"); + _assert_msg_(Type == TYPE_REG, "Rm must be with Reg"); return Value; } u32 Imm5() { - _assert_msg_(JIT, (Type == TYPE_IMM), "Imm5 not IMM value"); + _assert_msg_((Type == TYPE_IMM), "Imm5 not IMM value"); return ((Value & 0x0000001F) << 7); } u32 Imm8() { - _assert_msg_(JIT, (Type == TYPE_IMM), "Imm8Rot not IMM value"); + _assert_msg_((Type == TYPE_IMM), "Imm8Rot not IMM value"); return Value & 0xFF; } u32 Imm8Rot() // IMM8 with Rotation { - _assert_msg_(JIT, (Type == TYPE_IMM), "Imm8Rot not IMM value"); - _assert_msg_(JIT, (Rotation & 0xE1) != 0, "Invalid Operand2: immediate rotation %u", Rotation); + _assert_msg_((Type == TYPE_IMM), "Imm8Rot not IMM value"); + _assert_msg_((Rotation & 0xE1) != 0, "Invalid Operand2: immediate rotation %u", Rotation); return (1 << 25) | (Rotation << 7) | (Value & 0x000000FF); } u32 Imm12() { - _assert_msg_(JIT, (Type == TYPE_IMM), "Imm12 not IMM"); + _assert_msg_((Type == TYPE_IMM), "Imm12 not IMM"); return (Value & 0x00000FFF); } @@ -272,12 +274,12 @@ class Operand2 // expand a 8bit IMM to a 32bit value and gives you some rotation as // well. 
// Each rotation rotates to the right by 2 bits - _assert_msg_(JIT, (Type == TYPE_IMM), "Imm12Mod not IMM"); + _assert_msg_((Type == TYPE_IMM), "Imm12Mod not IMM"); return ((Rotation & 0xF) << 8) | (Value & 0xFF); } u32 Imm16() { - _assert_msg_(JIT, (Type == TYPE_IMM), "Imm16 not IMM"); + _assert_msg_((Type == TYPE_IMM), "Imm16 not IMM"); return ( (Value & 0xF000) << 4) | (Value & 0x0FFF); } u32 Imm16Low() @@ -286,12 +288,12 @@ class Operand2 } u32 Imm16High() // Returns high 16bits { - _assert_msg_(JIT, (Type == TYPE_IMM), "Imm16 not IMM"); + _assert_msg_((Type == TYPE_IMM), "Imm16 not IMM"); return ( ((Value >> 16) & 0xF000) << 4) | ((Value >> 16) & 0x0FFF); } u32 Imm24() { - _assert_msg_(JIT, (Type == TYPE_IMM), "Imm16 not IMM"); + _assert_msg_((Type == TYPE_IMM), "Imm16 not IMM"); return (Value & 0x0FFFFFFF); } }; @@ -380,11 +382,11 @@ class FakeXEmitter } virtual ~FakeXEmitter() {} - void SetCodePtr(u8 *ptr) {} + void SetCodePointer(u8 *ptr, u8 *writePtr) {} void ReserveCodeSpace(u32 bytes) {} const u8 *AlignCode16() { return nullptr; } const u8 *AlignCodePage() { return nullptr; } - const u8 *GetCodePtr() const { return nullptr; } + const u8 *GetCodePointer() const { return nullptr; } void FlushIcache() {} void FlushIcacheSection(u8 *start, u8 *end) {} u8 *GetWritableCodePtr() { return nullptr; } @@ -411,7 +413,7 @@ class FakeXEmitter // Everything that needs to generate machine code should inherit from this. // You get memory management for free, plus, you can use all the MOV etc functions without // having to prefix them with gen-> or something similar. -class FakeXCodeBlock : public FakeXEmitter +class FakeXCodeBlock : public CodeBlock { protected: u8 *region; @@ -421,12 +423,14 @@ class FakeXCodeBlock : public FakeXEmitter FakeXCodeBlock() : region(NULL), region_size(0) {} virtual ~FakeXCodeBlock() { if (region) FreeCodeSpace(); } + void PoisonMemory(int offset) override {} + // Call this before you generate any code. 
void AllocCodeSpace(int size) { } // Always clear code space with breakpoints, so that if someone accidentally executes // uninitialized, it just breaks into the debugger. - void ClearCodeSpace() { } + void ClearCodeSpace(int offset = 0) { } // Call this when shutting down. Don't rely on the destructor, even though it'll do the job. void FreeCodeSpace() { } diff --git a/Common/FileUtil.cpp b/Common/FileUtil.cpp index d18ccecad43a..a38441c08144 100644 --- a/Common/FileUtil.cpp +++ b/Common/FileUtil.cpp @@ -850,6 +850,8 @@ const std::string &GetExeDirectory() *(last_slash + 1) = '\0'; ExePath = program_path; } +#elif defined(__wiiu__) + ExePath = "sd:/ppsspp/"; #endif } diff --git a/Common/KeyMap.cpp b/Common/KeyMap.cpp index 19b93711df46..feb2aed85bdc 100644 --- a/Common/KeyMap.cpp +++ b/Common/KeyMap.cpp @@ -289,6 +289,29 @@ static const DefMappingStruct defaultXperiaPlay[] = { {VIRTKEY_AXIS_Y_MAX, JOYSTICK_AXIS_Y, +1}, }; +static const DefMappingStruct defaultWiiuGamepadKeyMap[] = { + {VIRTKEY_AXIS_X_MIN , JOYSTICK_AXIS_X, -1}, + {VIRTKEY_AXIS_X_MAX , JOYSTICK_AXIS_X, +1}, + {VIRTKEY_AXIS_Y_MIN , JOYSTICK_AXIS_Y, -1}, + {VIRTKEY_AXIS_Y_MAX , JOYSTICK_AXIS_Y, +1}, + {CTRL_CIRCLE , NKCODE_BUTTON_A}, + {CTRL_CROSS , NKCODE_BUTTON_B}, + {CTRL_TRIANGLE , NKCODE_BUTTON_X}, + {CTRL_SQUARE , NKCODE_BUTTON_Y}, + {CTRL_UP , NKCODE_DPAD_UP}, + {CTRL_RIGHT , NKCODE_DPAD_RIGHT}, + {CTRL_DOWN , NKCODE_DPAD_DOWN}, + {CTRL_LEFT , NKCODE_DPAD_LEFT}, + {CTRL_START , NKCODE_BUTTON_START}, + {CTRL_SELECT , NKCODE_BUTTON_SELECT}, + {CTRL_LTRIGGER , NKCODE_BUTTON_L1}, + {CTRL_RTRIGGER , NKCODE_BUTTON_R1}, + {VIRTKEY_UNTHROTTLE , JOYSTICK_AXIS_RTRIGGER, +1}, + {VIRTKEY_SPEED_TOGGLE , NKCODE_BUTTON_THUMBR}, + {VIRTKEY_PAUSE , JOYSTICK_AXIS_LTRIGGER, +1}, + {VIRTKEY_PAUSE , NKCODE_HOME}, +}; + void KeyCodesFromPspButton(int btn, std::vector *keycodes) { for (auto i = g_controllerMap[btn].begin(), end = g_controllerMap[btn].end(); i != end; ++i) { 
keycodes->push_back((keycode_t)i->keyCode); @@ -420,6 +443,9 @@ void SetDefaultKeyMap(DefaultMaps dmap, bool replace) { case DEFAULT_MAPPING_XPERIA_PLAY: SetDefaultKeyMap(DEVICE_ID_DEFAULT, defaultXperiaPlay, ARRAY_SIZE(defaultXperiaPlay), replace); break; + case DEFAULT_MAPPING_WIIU: + SetDefaultKeyMap(DEVICE_ID_PAD_0, defaultWiiuGamepadKeyMap, ARRAY_SIZE(defaultWiiuGamepadKeyMap), replace); + break; } UpdateNativeMenuKeys(); @@ -904,6 +930,8 @@ void RestoreDefault() { INFO_LOG(SYSTEM, "Default pad map"); SetDefaultKeyMap(DEFAULT_MAPPING_PAD, true); } +#elif defined(__wiiu__) + SetDefaultKeyMap(DEFAULT_MAPPING_WIIU, false); #else SetDefaultKeyMap(DEFAULT_MAPPING_KEYBOARD, true); SetDefaultKeyMap(DEFAULT_MAPPING_PAD, false); diff --git a/Common/KeyMap.h b/Common/KeyMap.h index 86883a9e9800..172e51f4b646 100644 --- a/Common/KeyMap.h +++ b/Common/KeyMap.h @@ -73,6 +73,7 @@ enum DefaultMaps { DEFAULT_MAPPING_OUYA, DEFAULT_MAPPING_XPERIA_PLAY, DEFAULT_MAPPING_MOQI_I7S, + DEFAULT_MAPPING_WIIU, }; const float AXIS_BIND_THRESHOLD = 0.75f; diff --git a/Common/MemArena.h b/Common/MemArena.h index 0c3b7945466b..5164f9ab2baa 100644 --- a/Common/MemArena.h +++ b/Common/MemArena.h @@ -52,6 +52,8 @@ class MemArena { #elif defined(__APPLE__) size_t vm_size; vm_address_t vm_mem; // same type as vm_address_t +#elif defined(__wiiu__) + u8 *memblock = nullptr; #else int fd; #endif diff --git a/Common/MemArenaPosix.cpp b/Common/MemArenaPosix.cpp index 15647b147a00..f35f9c1487e1 100644 --- a/Common/MemArenaPosix.cpp +++ b/Common/MemArenaPosix.cpp @@ -17,7 +17,7 @@ #include "ppsspp_config.h" -#if !defined(_WIN32) && !defined(ANDROID) && !defined(__APPLE__) +#if !defined(_WIN32) && !defined(ANDROID) && !defined(__APPLE__) && !defined(__wiiu__) #include diff --git a/Common/MemArenaWiiU.cpp b/Common/MemArenaWiiU.cpp new file mode 100644 index 000000000000..f00cc295adf6 --- /dev/null +++ b/Common/MemArenaWiiU.cpp @@ -0,0 +1,79 @@ +// Copyright (C) 2003 Dolphin Project. 
+ +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +#include "ppsspp_config.h" + +#ifdef __wiiu__ + +#include +#include +#include + +#include "FileUtil.h" +#include "MemoryUtil.h" +#include "MemArena.h" + + +size_t MemArena::roundup(size_t x) { return (x + (OS_MMAP_PAGE_SIZE - 1)) & ~(OS_MMAP_PAGE_SIZE - 1); } +void *rounddown(void *addr) { return (void *)((uintptr_t)addr & ~(OS_MMAP_PAGE_SIZE - 1)); } + +bool MemArena::NeedsProbing() { return false; } + +void MemArena::GrabLowMemSpace(size_t size) { + // TODO: this is unreliable as it could be fragmented. + memblock = (u8 *)MEM2_alloc(size, OS_MMAP_PAGE_SIZE); + DEBUG_VAR(memblock); + DEBUG_VAR(size); + DEBUG_VAR(OSEffectiveToPhysical(memblock)); +} + +void MemArena::ReleaseSpace() { + MEM2_free(memblock); + memblock = nullptr; +} + +void *MemArena::CreateView(s64 offset, size_t size, void *base) { + printf("View: offset:0x%08X size:0x%08X base:0x%08X ", (u32)offset, (u32)size, (u32)base); + // TODO: [oldbase, oldbase + oldsize] needs to be inside [newbase, newbase + newsize], then return oldbase + // this should work since there is no page collisions on the requested views. 
+ size = roundup(size); + size_t diff = (u32)base & (OS_MMAP_PAGE_SIZE - 1); + base = OSAllocVirtAddr(rounddown(base), size, OS_MMAP_PAGE_SIZE); + printf("--> size:0x%08X base:0x%08X ", (u32)size, (u32)base); + if (!OSMapMemory(base, OSEffectiveToPhysical(memblock + offset), size, OS_MMAP_RW)) { + printf("--> 0x00000000\n"); + return nullptr; + } + printf("--> 0x%08X\n", (u32)base + (u32)diff); + + return (u8 *)base + diff; +} + +void MemArena::ReleaseView(void *view, size_t size) { + OSUnmapMemory(rounddown(view), roundup(size)); + OSFreeVirtAddr(rounddown(view), roundup(size)); +} + +u8 *MemArena::Find4GBBase() { + size_t size = 0x10000000; + void *base = OSAllocVirtAddr(nullptr, size, OS_MMAP_PAGE_SIZE); + _assert_msg_(base, "Failed to map 256 MB of memory space"); + OSFreeVirtAddr(base, size); + return (u8 *)base; +} + +#endif diff --git a/Common/MemoryUtil.cpp b/Common/MemoryUtil.cpp index c834444563bd..79e49da052fa 100644 --- a/Common/MemoryUtil.cpp +++ b/Common/MemoryUtil.cpp @@ -37,6 +37,13 @@ #include #endif +#ifdef __wiiu__ +#include +#include +#include +static MEMExpandedHeap* rwx_heap; +#endif + #ifndef _WIN32 #include #endif @@ -46,6 +53,8 @@ static int hint_location; #elif defined(_WIN32) static SYSTEM_INFO sys_info; #define MEM_PAGE_SIZE (uintptr_t)(sys_info.dwPageSize) +#elif defined(__wiiu__) +#define MEM_PAGE_SIZE OS_MMAP_PAGE_SIZE #else #define MEM_PAGE_SIZE (getpagesize()) #endif @@ -69,8 +78,7 @@ static uint32_t ConvertProtFlagsWin32(uint32_t flags) { } return protect; } - -#else +#elif !defined(__wiiu__) static uint32_t ConvertProtFlagsUnix(uint32_t flags) { uint32_t protect = 0; @@ -155,6 +163,11 @@ void *AllocateExecutableMemory(size_t size) { ptr = VirtualAlloc(0, size, MEM_RESERVE | MEM_COMMIT, prot); #endif } +#elif defined(__wiiu__) + if (!rwx_heap) { + rwx_heap = MEMCreateExpHeapEx((u32*)0x00802000, 0x01000000 - 0x00802000, MEM_HEAP_FLAG_ZERO_ALLOCATED | MEM_HEAP_FLAG_USE_LOCK); + } + void *ptr = MEMAllocFromExpHeapEx(rwx_heap, size, 
0x100); #else static char *map_hint = 0; #if defined(_M_X64) @@ -177,7 +190,7 @@ void *AllocateExecutableMemory(size_t size) { void* ptr = mmap(map_hint, size, prot, MAP_ANON | MAP_PRIVATE, -1, 0); #endif /* defined(_WIN32) */ -#if !defined(_WIN32) +#if !defined(_WIN32) && !defined(__wiiu__) static const void *failed_result = MAP_FAILED; #else static const void *failed_result = nullptr; @@ -218,6 +231,8 @@ void *AllocateMemoryPages(size_t size, uint32_t memProtFlags) { ERROR_LOG(MEMMAP, "Failed to allocate raw memory pages"); return nullptr; } +#elif defined(__wiiu__) + void* ptr = malloc(size); #else uint32_t protect = ConvertProtFlagsUnix(memProtFlags); void *ptr = mmap(0, size, protect, MAP_ANON | MAP_PRIVATE, -1, 0); @@ -237,7 +252,7 @@ void *AllocateAlignedMemory(size_t size, size_t alignment) { void* ptr = _aligned_malloc(size,alignment); #else void* ptr = NULL; -#ifdef __ANDROID__ +#if defined(__ANDROID__) || defined(__wiiu__) ptr = memalign(alignment, size); #else if (posix_memalign(&ptr, alignment, size) != 0) { @@ -259,6 +274,12 @@ void FreeMemoryPages(void *ptr, size_t size) { if (!VirtualFree(ptr, 0, MEM_RELEASE)) { ERROR_LOG(MEMMAP, "FreeMemoryPages failed!\n%s", GetLastErrorMsg()); } +#elif defined(__wiiu__) + if ((u32)ptr < 0x01000000) { + MEMFreeToExpHeap(rwx_heap, ptr); + } else { + free(ptr); + } #else munmap(ptr, size); #endif @@ -313,6 +334,8 @@ bool ProtectMemoryPages(const void* ptr, size_t size, uint32_t memProtFlags) { } #endif return true; +#elif defined(__wiiu__) + return true; #else uint32_t protect = ConvertProtFlagsUnix(memProtFlags); uintptr_t page_size = GetMemoryProtectPageSize(); diff --git a/Common/MemoryUtil.h b/Common/MemoryUtil.h index 53c917b7b8eb..1bf70bbd21f6 100644 --- a/Common/MemoryUtil.h +++ b/Common/MemoryUtil.h @@ -17,9 +17,10 @@ #pragma once -#ifndef _WIN32 +#if !defined(_WIN32) && !defined(__wiiu__) #include #endif +#include #include // Returns true if we need to avoid setting both writable and executable at the same 
time (W^X) diff --git a/Common/Misc.cpp b/Common/Misc.cpp index af573729b93c..0ca6090f10ab 100644 --- a/Common/Misc.cpp +++ b/Common/Misc.cpp @@ -20,7 +20,7 @@ #include -#if defined(__APPLE__) +#if defined(__APPLE__) || defined(__wiiu__) #define __thread #endif diff --git a/Common/Serialize/SerializeFuncs.h b/Common/Serialize/SerializeFuncs.h index 3080749fa377..86dff2feed02 100644 --- a/Common/Serialize/SerializeFuncs.h +++ b/Common/Serialize/SerializeFuncs.h @@ -35,6 +35,8 @@ void Do(PointerWrap &p, tm &t); // Which also can be a problem, for example struct tm is non-POD on linux, for whatever reason... #ifdef _MSC_VER template::value, bool isPointer = std::is_pointer::value> +#elif defined (__BIG_ENDIAN__) // treat swapped types as pod. +template::value> #else template::value> #endif @@ -111,9 +113,9 @@ void Do(PointerWrap &p, std::vector &x, T &default_val) { DoVector(p, x, default_val); } -template -void Do(PointerWrap &p, swap_struct_t &x) { - T v = x.swap(); +template +void Do(PointerWrap &p, swap_t &x) { + T v = x; Do(p, v); x = v; } diff --git a/Common/Swap.h b/Common/Swap.h index 3a5d8ac027f6..2f06321472e7 100644 --- a/Common/Swap.h +++ b/Common/Swap.h @@ -18,569 +18,162 @@ #pragma once #include +#include +#include +#include "base/basictypes.h" #include "Common/CommonTypes.h" +#include "Common/Log.h" -// Android -#if defined(__ANDROID__) -#include - -#if _BYTE_ORDER == _LITTLE_ENDIAN && !defined(COMMON_LITTLE_ENDIAN) -#define COMMON_LITTLE_ENDIAN 1 -#elif _BYTE_ORDER == _BIG_ENDIAN && !defined(COMMON_BIG_ENDIAN) -#define COMMON_BIG_ENDIAN 1 -#endif - -// GCC 4.6+ -#elif __GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) - -#if __BYTE_ORDER__ && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) && !defined(COMMON_LITTLE_ENDIAN) -#define COMMON_LITTLE_ENDIAN 1 -#elif __BYTE_ORDER__ && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) && !defined(COMMON_BIG_ENDIAN) -#define COMMON_BIG_ENDIAN 1 -#endif - -// LLVM/clang -#elif __clang__ - -#if __LITTLE_ENDIAN__ && 
!defined(COMMON_LITTLE_ENDIAN) -#define COMMON_LITTLE_ENDIAN 1 -#elif __BIG_ENDIAN__ && !defined(COMMON_BIG_ENDIAN) -#define COMMON_BIG_ENDIAN 1 +#if !defined(__BIG_ENDIAN__) && defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#define __BIG_ENDIAN__ 1 #endif -// MSVC -#elif defined(_MSC_VER) && !defined(COMMON_BIG_ENDIAN) && !defined(COMMON_LITTLE_ENDIAN) - -#define COMMON_LITTLE_ENDIAN 1 - +#if !defined(__BIG_ENDIAN__) && !defined(__LITTLE_ENDIAN__) +#define __LITTLE_ENDIAN__ 1 #endif -// Worst case, default to little endian. -#if !COMMON_BIG_ENDIAN && !COMMON_LITTLE_ENDIAN -#define COMMON_LITTLE_ENDIAN 1 -#endif - -#ifdef _MSC_VER -inline unsigned long long bswap64(unsigned long long x) { return _byteswap_uint64(x); } -inline unsigned int bswap32(unsigned int x) { return _byteswap_ulong(x); } -inline unsigned short bswap16(unsigned short x) { return _byteswap_ushort(x); } -#elif defined(__DragonFly__) || defined(__FreeBSD__) || \ - defined(__NetBSD__) || defined(__OpenBSD__) -#include -# ifdef __OpenBSD__ -#define bswap16 swap16 -#define bswap32 swap32 -#define bswap64 swap64 -# endif -#else -// TODO: speedup -inline unsigned short bswap16(unsigned short x) { return (x << 8) | (x >> 8); } -inline unsigned int bswap32(unsigned int x) { return (x >> 24) | ((x & 0xFF0000) >> 8) | ((x & 0xFF00) << 8) | (x << 24); } -inline unsigned long long bswap64(unsigned long long x) { return ((unsigned long long)bswap32(x) << 32) | bswap32(x >> 32); } -#endif - -inline float bswapf(float f) { - union { - float f; - unsigned int u32; - } dat1, dat2; - - dat1.f = f; - dat2.u32 = bswap32(dat1.u32); - - return dat2.f; -} - -inline double bswapd(double f) { - union { - double f; - unsigned long long u64; - } dat1, dat2; - - dat1.f = f; - dat2.u64 = bswap64(dat1.u64); - - return dat2.f; -} - -template -struct swap_struct_t { - typedef swap_struct_t swapped_t; - -protected: - T value; - - static T swap(T v) { - return F::swap(v); - } -public: - T const swap() const 
{ - return swap(value); - } - swap_struct_t() : value((T)0) {} - swap_struct_t(const T &v): value(swap(v)) {} - - template - swapped_t& operator=(const S &source) { - value = swap((T)source); - return *this; - } - - operator unsigned long() const { return (unsigned long)swap(); } - operator long() const { return (long)swap(); } - operator s8() const { return (s8)swap(); } - operator u8() const { return (u8)swap(); } - operator s16() const { return (s16)swap(); } - operator u16() const { return (u16)swap(); } - operator s32() const { return (s32)swap(); } - operator u32() const { return (u32)swap(); } - operator s64() const { return (s64)swap(); } - operator u64() const { return (u64)swap(); } - operator float() const { return (float)swap(); } - operator double() const { return (double)swap(); } - - // +v - swapped_t operator +() const { - return +swap(); - } - // -v - swapped_t operator -() const { - return -swap(); - } - - // v / 5 - swapped_t operator/(const swapped_t &i) const { - return swap() / i.swap(); - } - template - swapped_t operator/(const S &i) const { - return swap() / i; - } - - // v * 5 - swapped_t operator*(const swapped_t &i) const { - return swap() * i.swap(); - } - template - swapped_t operator*(const S &i) const { - return swap() * i; - } - - // v + 5 - swapped_t operator+(const swapped_t &i) const { - return swap() + i.swap(); - } - template - swapped_t operator+(const S &i) const { - return swap() + (T)i; - } - // v - 5 - swapped_t operator-(const swapped_t &i) const { - return swap() - i.swap(); - } - template - swapped_t operator-(const S &i) const { - return swap() - (T)i; - } +template struct swap_t { + static_assert(std::is_scalar::value && (sizeof(T) > 1), "swap_t used with an invalid type"); - // v += 5 - swapped_t& operator+=(const swapped_t &i) { - value = swap(swap() + i.swap()); - return *this; - } - template - swapped_t& operator+=(const S &i) { - value = swap(swap() + (T)i); - return *this; - } - // v -= 5 - swapped_t& 
operator-=(const swapped_t &i) { - value = swap(swap() - i.swap()); - return *this; - } - template - swapped_t& operator-=(const S &i) { - value = swap(swap() - (T)i); - return *this; +private: + static T swap(T val) { + switch (sizeof(T)) { + case 2: *(u16 *)&val = swap16(*(u16 *)&val); break; + case 4: *(u32 *)&val = swap32(*(u32 *)&val); break; + case 8: *(u64 *)&val = swap64(*(u64 *)&val); break; + default: break; + } + return val; } - // ++v - swapped_t& operator++() { - value = swap(swap()+1); - return *this; - } - // --v - swapped_t& operator--() { - value = swap(swap()-1); - return *this; - } - - // v++ - swapped_t operator++(int) { - swapped_t old = *this; - value = swap(swap()+1); +public: + swap_t() {} + swap_t(T val) : swapped(swap(val)) {} + + swap_t &operator=(T val) { return *this = swap_t(val); } + + swap_t &operator&=(T val) { return *this = *this & val; } + swap_t &operator|=(T val) { return *this = *this | val; } + swap_t &operator^=(T val) { return *this = *this ^ val; } + swap_t &operator+=(T val) { return *this = *this + val; } + swap_t &operator-=(T val) { return *this = *this - val; } + swap_t &operator*=(T val) { return *this = *this * val; } + swap_t &operator/=(T val) { return *this = *this / val; } + swap_t &operator%=(T val) { return *this = *this % val; } + swap_t &operator<<=(T val) { return *this = *this << val; } + swap_t &operator>>=(T val) { return *this = *this >> val; } + swap_t &operator++() { return *this += 1; } + swap_t &operator--() { return *this -= 1; } + + T operator++(int) { + T old = *this; + *this += 1; return old; } - // v-- - swapped_t operator--(int) { - swapped_t old = *this; - value = swap(swap()-1); + + T operator--(int) { + T old = *this; + *this -= 1; return old; } - // Comparaison - // v == i - bool operator==(const swapped_t &i) const { - return swap() == i.swap(); - } - template - bool operator==(const S &i) const { - return swap() == i; - } - // v != i - bool operator!=(const swapped_t &i) const { - return 
swap() != i.swap(); - } - template - bool operator!=(const S &i) const { - return swap() != i; - } + operator T() const { return swap(swapped); } - // v > i - bool operator>(const swapped_t &i) const { - return swap() > i.swap(); - } - template - bool operator>(const S &i) const { - return swap() > i; - } +private: + T swapped; +}; - // v < i - bool operator<(const swapped_t &i) const { - return swap() < i.swap(); - } - template - bool operator<(const S &i) const { - return swap() < i; - } +#ifdef __LITTLE_ENDIAN__ +template using LEndian = T; +template using BEndian = swap_t; +#else +template using LEndian = swap_t; +template using BEndian = T; +#endif - // v >= i - bool operator>=(const swapped_t &i) const { - return swap() >= i.swap(); - } - template - bool operator>=(const S &i) const { - return swap() >= i; - } +typedef LEndian u16_le; +typedef LEndian u32_le; +typedef LEndian u64_le; - // v <= i - bool operator<=(const swapped_t &i) const { - return swap() <= i.swap(); - } - template - bool operator<=(const S &i) const { - return swap() <= i; - } +typedef LEndian s16_le; +typedef LEndian s32_le; +typedef LEndian s64_le; - // logical - swapped_t operator !() const { - return !swap(); - } - - bool operator ||(const swapped_t & b) const { - return swap() || b.swap(); - } - template - bool operator ||(const S & b) const { - return swap() || b; - } +typedef LEndian float_le; +typedef LEndian double_le; - // bitmath - swapped_t operator ~() const { - return ~swap(); - } +typedef BEndian u16_be; +typedef BEndian u32_be; +typedef BEndian u64_be; - swapped_t operator &(const swapped_t &b) const { - return swap() & b.swap(); - } - template - swapped_t operator &(const S &b) const { - return swap() & b; - } - swapped_t& operator &=(const swapped_t &b) { - value = swap(swap() & b.swap()); - return *this; - } - template - swapped_t& operator &=(const S b) { - value = swap(swap() & b); - return *this; - } +typedef BEndian s16_be; +typedef BEndian s32_be; +typedef BEndian 
s64_be; - swapped_t operator |(const swapped_t &b) const { - return swap() | b.swap(); - } - template - swapped_t operator |(const S &b) const { - return swap() | b; - } - swapped_t& operator |=(const swapped_t &b) { - value = swap(swap() | b.swap()); - return *this; - } - template - swapped_t& operator |=(const S &b) { - value = swap(swap() | b); - return *this; - } +typedef BEndian float_be; +typedef BEndian double_be; - swapped_t operator ^(const swapped_t &b) const { - return swap() ^ b.swap(); - } - template - swapped_t operator ^(const S &b) const { - return swap() ^ b; - } - swapped_t& operator ^=(const swapped_t &b) { - value = swap(swap() ^ b.swap()); - return *this; - } - template - swapped_t& operator ^=(const S &b) { - value = swap(swap() ^ b); - return *this; +template +static inline void ToLEndian(BEndian *ptr, size_t count) { + for (int i = 0; i < count; i++) { + ((LEndian*)ptr)[i] = ptr[i]; } +} - template - swapped_t operator <<(const S &b) const { - return swap() << b; - } - template - swapped_t& operator <<=(const S &b) const { - value = swap(swap() << b); - return *this; - } +template +static inline void ToLEndian(LEndian *ptr, size_t count) { + return; +} - template - swapped_t operator >>(const S &b) const { - return swap() >> b; - } - template - swapped_t& operator >>=(const S &b) const { - value = swap(swap() >> b); - return *this; +template +static inline void ToBEndian(LEndian *ptr, size_t count) { + for (int i = 0; i < count; i++) { + ((BEndian*)ptr)[i] = ptr[i]; } - - // Member - /** todo **/ - - - // Arithmetics - template - friend S operator+(const S &p, const swapped_t& v); - - template - friend S operator-(const S &p, const swapped_t& v); - - template - friend S operator/(const S &p, const swapped_t& v); - - template - friend S operator*(const S &p, const swapped_t& v); - - template - friend S operator%(const S &p, const swapped_t& v); - - // Arithmetics + assignements - template - friend S operator+=(const S &p, const swapped_t& v); 
- - template - friend S operator-=(const S &p, const swapped_t& v); - - // Bitmath - template - friend S operator&(const S &p, const swapped_t& v); - - // Comparison - template - friend bool operator<(const S &p, const swapped_t& v); - - template - friend bool operator>(const S &p, const swapped_t& v); - - template - friend bool operator<=(const S &p, const swapped_t& v); - - template - friend bool operator>=(const S &p, const swapped_t& v); - - template - friend bool operator!=(const S &p, const swapped_t& v); - - template - friend bool operator==(const S &p, const swapped_t& v); -}; - - -// Arithmetics -template -S operator+(const S &i, const swap_struct_t& v) { - return i + v.swap(); } -template -S operator-(const S &i, const swap_struct_t& v) { - return i - v.swap(); +template +static inline void ToBEndian(BEndian *ptr, size_t count) { + return; } -template -S operator/(const S &i, const swap_struct_t& v) { - return i / v.swap(); +template +static inline void GenericLog(LogTypes::LOG_LEVELS level, LogTypes::LOG_TYPE type, const char *file, int line, const char *fmt, swap_t v, Targs... args) { + GenericLog(level, type, file, line, fmt, (T)v, args...); } -template -S operator*(const S &i, const swap_struct_t& v) { - return i * v.swap(); +template +static inline void GenericLog(LogTypes::LOG_LEVELS level, LogTypes::LOG_TYPE type, const char *file, int line, const char *fmt, T0 v0, swap_t v, Targs... args) { + GenericLog(level, type, file, line, fmt, v0, (T)v, args...); } -template -S operator%(const S &i, const swap_struct_t& v) { - return i % v.swap(); +template +static inline void GenericLog(LogTypes::LOG_LEVELS level, LogTypes::LOG_TYPE type, const char *file, int line, const char *fmt, T0 v0, T1 v1, swap_t v, Targs... 
args) { + GenericLog(level, type, file, line, fmt, v0, v1, (T)v, args...); } -// Arithmetics + assignements -template -S &operator+=(S &i, const swap_struct_t& v) { - i += v.swap(); - return i; +template +static inline void GenericLog(LogTypes::LOG_LEVELS level, LogTypes::LOG_TYPE type, const char *file, int line, const char *fmt, T0 v0, T1 v1, T2 v2, swap_t v, Targs... args) { + GenericLog(level, type, file, line, fmt, v0, v1, v2, (T)v, args...); } -template -S &operator-=(S &i, const swap_struct_t& v) { - i -= v.swap(); - return i; +template +static inline void GenericLog(LogTypes::LOG_LEVELS level, LogTypes::LOG_TYPE type, const char *file, int line, const char *fmt, T0 v0, T1 v1, T2 v2, T3 v3, swap_t v, Targs... args) { + GenericLog(level, type, file, line, fmt, v0, v1, v2, v3, (T)v, args...); } -// Logical -template -S operator&(const S &i, const swap_struct_t& v) { - return i & v.swap(); +template +static inline void GenericLog(LogTypes::LOG_LEVELS level, LogTypes::LOG_TYPE type, const char *file, int line, const char *fmt, T0 v0, T1 v1, T2 v2, T3 v3, T4 v4, swap_t v, Targs... args) { + GenericLog(level, type, file, line, fmt, v0, v1, v2, v3, v4, (T)v, args...); } -template -S operator&(const swap_struct_t& v, const S &i) { - return (S)(v.swap() & i); +template +static inline void GenericLog(LogTypes::LOG_LEVELS level, LogTypes::LOG_TYPE type, const char *file, int line, const char *fmt, T0 v0, T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, swap_t v, Targs... 
args) { + GenericLog(level, type, file, line, fmt, v0, v1, v2, v3, v4, v5, (T)v, args...); } - -// Comparaison -template -bool operator<(const S &p, const swap_struct_t& v) { - return p < v.swap(); -} -template -bool operator>(const S &p, const swap_struct_t& v) { - return p > v.swap(); -} -template -bool operator<=(const S &p, const swap_struct_t& v) { - return p <= v.swap(); -} -template -bool operator>=(const S &p, const swap_struct_t& v) { - return p >= v.swap(); -} -template -bool operator!=(const S &p, const swap_struct_t& v) { - return p != v.swap(); +template +static inline void GenericLog(LogTypes::LOG_LEVELS level, LogTypes::LOG_TYPE type, const char *file, int line, const char *fmt, T0 v0, T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, swap_t v, Targs... args) { + GenericLog(level, type, file, line, fmt, v0, v1, v2, v3, v4, v5, v6, (T)v, args...); } -template -bool operator==(const S &p, const swap_struct_t& v) { - return p == v.swap(); -} - -template -struct swap_64_t { - static T swap(T x) { - return (T)bswap64(*(u64 *)&x); - } -}; - -template -struct swap_32_t { - static T swap(T x) { - return (T)bswap32(*(u32 *)&x); - } -}; - -template -struct swap_16_t { - static T swap(T x) { - return (T)bswap16(*(u16 *)&x); - } -}; - -template -struct swap_float_t { - static T swap(T x) { - return (T)bswapf(*(float *)&x); - } -}; - -template -struct swap_double_t { - static T swap(T x) { - return (T)bswapd(*(double *)&x); - } -}; - -#if COMMON_LITTLE_ENDIAN -typedef u32 u32_le; -typedef u16 u16_le; -typedef u64 u64_le; - -typedef s32 s32_le; -typedef s16 s16_le; -typedef s64 s64_le; - -typedef float float_le; -typedef double double_le; - -typedef swap_struct_t> u64_be; -typedef swap_struct_t> s64_be; - -typedef swap_struct_t> u32_be; -typedef swap_struct_t> s32_be; - -typedef swap_struct_t> u16_be; -typedef swap_struct_t> s16_be; - -typedef swap_struct_t > float_be; -typedef swap_struct_t > double_be; -#else - -typedef swap_struct_t> u64_le; -typedef swap_struct_t> 
s64_le; - -typedef swap_struct_t> u32_le; -typedef swap_struct_t> s32_le; - -typedef swap_struct_t> u16_le; -typedef swap_struct_t> s16_le; - -typedef swap_struct_t > float_le; -typedef swap_struct_t > double_le; - -typedef u32 u32_be; -typedef u16 u16_be; -typedef u64 u64_be; - -typedef s32 s32_be; -typedef s16 s16_be; -typedef s64 s64_be; - -typedef float float_be; -typedef double double_be; -#endif diff --git a/Common/TimeUtil.cpp b/Common/TimeUtil.cpp index 7a3f3b6c4611..7513c5d39bc0 100644 --- a/Common/TimeUtil.cpp +++ b/Common/TimeUtil.cpp @@ -5,6 +5,9 @@ #ifdef _WIN32 #include +#elif defined __wiiu__ +#include +#include #else #include #include @@ -36,7 +39,13 @@ double real_time_now() { double elapsed = static_cast(time.QuadPart - startTime.QuadPart); return elapsed * frequencyMult; } - +#elif defined(__wiiu__) +double real_time_now() { + static OSTime start; + if(!start) + start = OSGetSystemTime(); + return (double)(OSGetSystemTime() - start) * (1.0 / (double) wiiu_timer_clock); +} #else uint64_t _frequency = 0; diff --git a/Common/Vulkan/VulkanContext.cpp b/Common/Vulkan/VulkanContext.cpp index 2cb47926f014..2ff548d4dd0c 100644 --- a/Common/Vulkan/VulkanContext.cpp +++ b/Common/Vulkan/VulkanContext.cpp @@ -1185,6 +1185,7 @@ EShLanguage FindLanguage(const VkShaderStageFlagBits shader_type) { bool GLSLtoSPV(const VkShaderStageFlagBits shader_type, const char *pshader, std::vector &spirv, std::string *errorMessage) { +#ifndef __wiiu__ glslang::TProgram program; const char *shaderStrings[1]; @@ -1231,6 +1232,7 @@ bool GLSLtoSPV(const VkShaderStageFlagBits shader_type, options.optimizeSize = false; options.generateDebugInfo = false; glslang::GlslangToSpv(*program.getIntermediate(stage), spirv, &options); +#endif return true; } diff --git a/Common/ppcEmitter.cpp b/Common/ppcEmitter.cpp new file mode 100644 index 000000000000..fc76b50ac9a8 --- /dev/null +++ b/Common/ppcEmitter.cpp @@ -0,0 +1,988 @@ + +#include "ppsspp_config.h" + +#include "ppcEmitter.h" + 
+#ifdef __wiiu__ +#include +#endif + +#if !defined(DebugBreak) && !defined(_WIN32) +#ifdef __GNUC__ +#define DebugBreak() __builtin_trap() +#else +#define DebugBreak() Crash() +#endif +#endif + +// Helper + +// 0 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +// | OPCD | D | A | B | XO |Rc| +#define X_FORM(OPCD, D, A, B, XO, Rc) { \ + int a = (A), b = (B), d = (D); \ + Write32((OPCD << 26) | (d << 21) | (a << 16) | (b << 11) | (((XO) & 0x3ff) << 1) | (Rc)); \ +} + +// 0 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +// | OPCD | D | A | B |OE| XO |Rc| +#define XO_FORM(OPCD, D, A, B, OE, XO, Rc) { \ + int a = (A), b = (B), d = (D); \ + Write32((OPCD << 26) | (d << 21) | (a << 16) | (b << 11) | (OE << 10) | (((XO) & 0x1ff) << 1) | (Rc)); \ +} + +// 0 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +// | OPCD | D | A | B | C | XO |Rc| +#define A_FORM(OPCD, D, A, B, C, XO, Rc) { \ + int a = (A), b = (B), c = (C), d = (D); \ + Write32((OPCD << 26) | (d << 21) | (a << 16) | (b << 11) | (c << 6) | (XO << 1) | (Rc)); \ +} + +// 0 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +// | OPCD | D | A | d/UIMM/SIMM | +#define D_FORM(OPCD, RD, RA, IMM) { \ + int _ra = (RA), _rd = (RD); \ + Write32((OPCD << 26) | (_rd << 21) | (_ra << 16) | ((IMM) & 0xffff)); \ +} + +// 0 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +// | OPCD | D | A | B | C | XO |Rc| +#define A_FORM(OPCD, D, A, B, C, XO, Rc) { \ + int a = (A), b = (B), c = (C), d = (D); \ + Write32((OPCD << 26) | (d << 21) | (a << 16) | (b << 11) | (c << 6) | (XO << 1) | (Rc)); \ +} + +// 0 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +// | OPCD | BO/crbD | BI/crbA | crbB | XO |LK| +#define XL_FORM(OPCD, crbD, crbA, crbB, XO, LK) { \ + X_FORM(OPCD, crbD, 
crbA, crbB, XO, LK); \ +} + +// 0 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +// | OPCD | S | A | SH | MB | ME |Rc| +#define M_FORM(OPCD, RS, RA, SH, MB, ME, Rc) { \ + int rs = (RS), ra = (RA), sh = (SH); \ + Write32((OPCD << 26) | (rs << 21) | (ra << 16) | (sh << 11) | ((MB) << 6) | ((ME) << 1) | (Rc)); \ +} + +namespace PpcGen { + + // Mul stuff + + void PPCXEmitter::DIVW (PPCReg Rt, PPCReg Ra, PPCReg Rb) { + XO_FORM(31, Rt, Ra, Rb, 0, 491, 0); + } + void PPCXEmitter::DIVWU (PPCReg Rt, PPCReg Ra, PPCReg Rb) { + XO_FORM(31, Rt, Ra, Rb, 0, 459, 0); + } + void PPCXEmitter::MULLW (PPCReg Rt, PPCReg Ra, PPCReg Rb) { + XO_FORM(31, Rt, Ra, Rb, 0, 235, 0); + } + void PPCXEmitter::MULHW (PPCReg Rt, PPCReg Ra, PPCReg Rb) { + XO_FORM(31, Rt, Ra, Rb, 0, 75, 0); + } + void PPCXEmitter::MULHWU(PPCReg Rt, PPCReg Ra, PPCReg Rb) { + XO_FORM(31, Rt, Ra, Rb, 0, 11, 0); + } + + // Arithmetics ops + void PPCXEmitter::ADDZE (PPCReg Rd, PPCReg Ra) { + XO_FORM(31, Rd, Ra, 0, 0, 202, 0); + } + + void PPCXEmitter::ADD (PPCReg Rd, PPCReg Ra, PPCReg Rb) { + u32 instr = (0x7C000214 | (Rd << 21) | (Ra << 16) | (Rb << 11)); + Write32(instr); + } + + void PPCXEmitter::ADDI (PPCReg Rd, PPCReg Ra, short imm) { + u32 instr = (0x38000000 | (Rd << 21) | (Ra << 16) | ((imm) & 0xffff)); + Write32(instr); + } + + void PPCXEmitter::ADDIS (PPCReg Rd, PPCReg Ra, short imm) { + u32 instr = (0x3C000000 | (Rd << 21) | (Ra << 16) | ((imm) & 0xffff)); + Write32(instr); + } + + void PPCXEmitter::AND (PPCReg Rs, PPCReg Ra, PPCReg Rb) { + u32 instr = (0x7C000038 | (Ra << 21) | (Rs << 16) | (Rb << 11)); + Write32(instr); + } + + void PPCXEmitter::ANDI (PPCReg Rdest, PPCReg Ra, unsigned short imm) { + u32 instr = (0x70000000 | (Ra << 21) | (Rdest << 16) | ((imm) & 0xffff)); + Write32(instr); + } + + void PPCXEmitter::ANDIS (PPCReg Rdest, PPCReg Ra, unsigned short imm) { + u32 instr = (0x74000000 | (Ra << 21) | (Rdest << 16) | ((imm) & 0xffff)); + Write32(instr); + } 
+
+	// Memory load/store operations
+	//
+	// D-form accessors encode: opcode | (RT/RS << 21) | (RA << 16) | 16-bit displacement.
+	// The displacement is masked to 16 bits without a range check.
+	// For the stores, the parameter named 'dest' lands in the RS field (the register
+	// whose value is written) and 'src' lands in the RA field (the base address).
+
+	// li dest, imm (addi with RA = 0).
+	void PPCXEmitter::LI(PPCReg dest, unsigned short imm) {
+		u32 instr = (0x38000000 | (dest << 21) | ((imm) & 0xffff));
+		Write32(instr);
+	}
+
+	// lis dest, imm (addis with RA = 0): loads imm into the upper halfword.
+	void PPCXEmitter::LIS(PPCReg dest, unsigned short imm) {
+		u32 instr = (0x3C000000 | (dest << 21) | ((imm) & 0xffff));
+		Write32(instr);
+	}
+
+	// lbz dest, offset(src)
+	void PPCXEmitter::LBZ (PPCReg dest, PPCReg src, int offset) {
+		u32 instr = (0x88000000 | (dest << 21) | (src << 16) | ((offset) & 0xffff));
+		Write32(instr);
+	}
+
+	// lbzx dest, a, b
+	void PPCXEmitter::LBZX (PPCReg dest, PPCReg a, PPCReg b) {
+		u32 instr = ((31<<26) | (dest << 21) | (a << 16) | (b << 11) | (87<<1));
+		Write32(instr);
+	}
+
+	// lhz dest, offset(src)
+	void PPCXEmitter::LHZ (PPCReg dest, PPCReg src, int offset) {
+		u32 instr = (0xA0000000 | (dest << 21) | (src << 16) | ((offset) & 0xffff));
+		Write32(instr);
+	}
+
+	// lhbrx dest, src, offset (byte-reversed halfword load, indexed)
+	void PPCXEmitter::LHBRX (PPCReg dest, PPCReg src, PPCReg offset) {
+		u32 instr = (0x7C00062C | (dest << 21) | (src << 16) | (offset << 11));
+		Write32(instr);
+	}
+
+	// lwz dest, offset(src)
+	void PPCXEmitter::LWZ (PPCReg dest, PPCReg src, int offset) {
+		u32 instr = (0x80000000 | (dest << 21) | (src << 16) | ((offset) & 0xffff));
+		Write32(instr);
+	}
+
+	// lwbrx dest, src, offset (byte-reversed word load, indexed)
+	void PPCXEmitter::LWBRX (PPCReg dest, PPCReg src, PPCReg offset) {
+		u32 instr = (0x7C00042C | (dest << 21) | (src << 16) | (offset << 11));
+		Write32(instr);
+	}
+
+	// stb dest, offset(src)
+	void PPCXEmitter::STB (PPCReg dest, PPCReg src, int offset) {
+		u32 instr = (0x98000000 | (dest << 21) | (src << 16) | ((offset) & 0xffff));
+		Write32(instr);
+	}
+
+	// stbx dest, a, b
+	void PPCXEmitter::STBX (PPCReg dest, PPCReg a, PPCReg b) {
+		u32 instr = ((31<<26) | (dest << 21) | (a << 16) | (b << 11) | (215 << 1));
+		Write32(instr);
+	}
+
+	// sth dest, offset(src)
+	void PPCXEmitter::STH (PPCReg dest, PPCReg src, int offset) {
+		u32 instr = (0xB0000000 | (dest << 21) | (src << 16) | ((offset) & 0xffff));
+		Write32(instr);
+	}
+
+	// sthbrx dest, src, offset (byte-reversed halfword store, indexed)
+	void PPCXEmitter::STHBRX (PPCReg dest, PPCReg src, PPCReg offset) {
+		u32 instr = (0x7C00072C | (dest << 21) | (src << 16) | (offset << 11));
+		Write32(instr);
+	}
+
+	// stw dest, offset(src)
+	void PPCXEmitter::STW (PPCReg dest, PPCReg src, int offset) {
+		u32 instr = (0x90000000 | (dest << 21) | (src << 16) | ((offset) & 0xffff));
+		Write32(instr);
+	}
+
+	// stwu dest, offset(src) (store word with update of the base register)
+	void PPCXEmitter::STWU (PPCReg dest, PPCReg src, int offset) {
+		u32 instr = (0x94000000 | (dest << 21) | (src << 16) | ((offset) & 0xffff));
+		Write32(instr);
+	}
+
+	// stwbrx dest, src, offset (byte-reversed word store, indexed)
+	void PPCXEmitter::STWBRX (PPCReg dest, PPCReg src, PPCReg offset) {
+		u32 instr = (0x7C00052C | (dest << 21) | (src << 16) | (offset << 11));
+		Write32(instr);
+	}
+
+#if PPSSPP_ARCH(64BIT)
+	// ld dest, offset(src)
+	void PPCXEmitter::LD (PPCReg dest, PPCReg src, int offset) {
+		u32 instr = ((58 << 26) | (dest << 21) | (src << 16) | ((offset) & 0xffff));
+		Write32(instr);
+	}
+	// std dest, offset(src)
+	void PPCXEmitter::STD (PPCReg dest, PPCReg src, int offset) {
+		u32 instr = ((62 << 26) | (dest << 21) | (src << 16) | ((offset) & 0xffff));
+		Write32(instr);
+	}
+#endif
+
+	// Branch operations
+
+	// b: unconditional branch, relative to the current emit position.
+	void PPCXEmitter::B (const void *fnptr) {
+		s32 func = (intptr_t)fnptr - intptr_t(code);
+		u32 instr = (0x48000000 | ((s32)((func) & 0x3fffffc)));
+		Write32(instr);
+	}
+
+	// bl: relative branch with the LK bit set.
+	void PPCXEmitter::BL(const void *fnptr) {
+		s32 func = (intptr_t)fnptr - intptr_t(code);
+		u32 instr = (0x48000001 | ((s32)((func) & 0x3fffffc)));
+		Write32(instr);
+	}
+
+	// ba: branch to an absolute address (AA bit set).
+	void PPCXEmitter::BA (const void *fnptr) {
+		s32 func = (intptr_t)fnptr;
+		u32 instr = (0x48000002 | ((s32)((func) & 0x3fffffc)));
+		Write32(instr);
+	}
+
+	// bla: absolute branch with the LK bit set.
+	void PPCXEmitter::BLA (const void *fnptr) {
+		s32 func = (intptr_t)fnptr;
+		u32 instr = (0x48000003 | ((s32)((func) & 0x3fffffc)));
+		Write32(instr);
+	}
+
+
+// Both macros expect 'code' (current emit pointer) and 'fnptr' (branch target) in scope.
+// IS_SMALL_JUMP is true when the signed displacement fits the 16-bit conditional-branch
+// field. The previous version compared an unsigned difference against -32767 on both
+// sides, so it only matched a single distance. CHECK_SMALL_JUMP traps when the target
+// is NOT reachable (it previously trapped on reachable targets).
+#define IS_SMALL_JUMP (((intptr_t)fnptr - (intptr_t)code) >= -32768 && ((intptr_t)fnptr - (intptr_t)code) <= 32767)
+#define CHECK_SMALL_JUMP { if(!IS_SMALL_JUMP) { DebugBreak(); } }
+
+	// beq fnptr (tests CR0). Short-range only.
+	void PPCXEmitter::BEQ (const void *fnptr) {
+		CHECK_SMALL_JUMP
+
+		s32 func = (intptr_t)fnptr - intptr_t(code);
+		u32 instr = (0x41820000 | ( func & 0xfffc));
+		Write32(instr);
+	}
+
+	// bne fnptr (tests CR0). Short-range only.
+	void PPCXEmitter::BNE (const void *fnptr) {
+		CHECK_SMALL_JUMP
+
+		s32 func = (intptr_t)fnptr - intptr_t(code);
+		u32 instr = (0x40820000 | ( func & 0xfffc));
+		Write32(instr);
+	}
+
+	// bgt fnptr (tests CR0). Short-range only.
+	void PPCXEmitter::BGT(const void *fnptr) {
+		CHECK_SMALL_JUMP
+
+		s32 func = (intptr_t)fnptr - intptr_t(code);
+		u32 instr = (0x41810000 | (((s16)(((func)+1))) & 0xfffc));
+		Write32(instr);
+	}
+
+
+	// bltctr: branch to CTR if CR0 "less than" is set.
+	void PPCXEmitter::BLTCTR() {
+		Write32((19 << 26) | (12 << 21) | (528 <<1));
+		// Break();
+	}
+
+	// blt fnptr (tests CR0). Far targets go through CTR, which clobbers R0 and CTR.
+	void PPCXEmitter::BLT (const void *fnptr) {
+		//CHECK_JUMP
+		if (!IS_SMALL_JUMP) {
+			u32 func_addr = (uintptr_t) fnptr;
+			// Load func address
+			MOVI2R(R0, func_addr);
+			// Set it to the count register
+			MTCTR(R0);
+			// Branch
+			BLTCTR();
+			return;
+		}
+
+		s32 func = (intptr_t)fnptr - intptr_t(code);
+		u32 instr = (0x41800000 | (((s16)(((func)+1))) & 0xfffc));
+		Write32(instr);
+	}
+
+	// ble fnptr (tests CR0). Short-range only.
+	void PPCXEmitter::BLE (const void *fnptr) {
+		CHECK_SMALL_JUMP
+
+		s32 func = (intptr_t)fnptr - intptr_t(code);
+		u32 instr = (0x40810000 | (((s16)(((func)+1))) & 0xfffc));
+		Write32(instr);
+	}
+
+	// bctrl: branch to CTR and link.
+	void PPCXEmitter::BCTRL() {
+		Write32(0x4E800421);
+	}
+
+	// bctr: branch to CTR.
+	void PPCXEmitter::BCTR() {
+		Write32(0x4E800420);
+	}
+
+	// Link Register
+	// mflr r
+	void PPCXEmitter::MFLR(PPCReg r) {
+		Write32(0x7C0802A6 | r << 21);
+	}
+
+	// mtlr r
+	void PPCXEmitter::MTLR(PPCReg r) {
+		Write32(0x7C0803A6 | r << 21);
+	}
+
+	// mtctr r
+	void PPCXEmitter::MTCTR(PPCReg r) {
+		Write32(0x7C0903A6 | r << 21);
+	}
+
+	// blr
+	void PPCXEmitter::BLR() {
+		Write32(0x4E800020);
+	}
+
+	// bgtlr (tests CR0)
+	void PPCXEmitter::BGTLR() {
+		Write32(0x4D810020);
+	}
+
+	// Fixup
+	// Each creator reserves one instruction slot (a NOP) and returns a FixupBranch
+	// describing it; SetJumpTarget() later overwrites the slot with the real branch.
+	FixupBranch PPCXEmitter::B()
+	{
+		return B_Cond(_B);
+	}
+
+	FixupBranch PPCXEmitter::BL()
+	{
+		return B_Cond(_BL);
+	}
+
+
+	FixupBranch PPCXEmitter::BNE() {
+		return B_Cond(_BNE);
+	}
+
+	FixupBranch PPCXEmitter::BLT() {
+		return B_Cond(_BLT);
+	}
+
+	FixupBranch PPCXEmitter::BLE() {
+		return B_Cond(_BLE);
+	}
+
+	// Records the current emit position, the requested branch type and the emitter's
+	// 'condition' member, then emits a placeholder NOP.
+	FixupBranch PPCXEmitter::B_Cond(FixupBranchType type) {
+		FixupBranch branch;
+		branch.type = type;
+		branch.ptr = code;
+		branch.condition = condition;
+		//We'll write NOP here for now.
+		Write32(0x60000000);
+		return branch;
+	}
+
+	// Patches the slot reserved by a fixup creator so it branches to the current
+	// emit position. The range assert is applied to every branch type.
+	void PPCXEmitter::SetJumpTarget(FixupBranch const &branch)
+	{
+		s32 distance = intptr_t(code) - (intptr_t)branch.ptr;
+		_assert_msg_(distance > -32767
+			&& distance <= 32767,
+			"SetJumpTarget out of range (%p calls %p)", code,
+			branch.ptr);
+
+		switch(branch.type) {
+		case _B:
+			*(u32*)branch.ptr = (0x48000000 | ((s32)((distance) & 0x3fffffc)));
+			break;
+		case _BL:
+			*(u32*)branch.ptr = (0x48000001 | ((s32)((distance) & 0x3fffffc)));
+			break;
+		case _BEQ:
+			*(u32*)branch.ptr = (0x41820000 | ((s16)(((distance)+1)) & 0xfffc));
+			break;
+		case _BNE:
+			*(u32*)branch.ptr = (0x40820000 | ((s16)(((distance)+1)) & 0xfffc));
+			break;
+		case _BLT:
+			*(u32*)branch.ptr = (0x41800000 | ((s16)(((distance)+1)) & 0xfffc));
+			break;
+		case _BLE:
+			*(u32*)branch.ptr = (0x40810000 | ((s16)(((distance)+1)) & 0xfffc));
+			break;
+		case _BGT:
+			*(u32*)branch.ptr = (0x41810000 | ((s16)(((distance)+1)) & 0xfffc));
+			break;
+		case _BGE:
+			*(u32*)branch.ptr = (0x40800000 | ((s16)(((distance)+1)) & 0xfffc));
+			break;
+		default:
+			// Error !!!
+			_assert_msg_(0, "SetJumpTarget unknow branch type: %d", branch.type);
+			break;
+		}
+	}
+
+	// Compare (Only use CR0 atm...)
+	// cmpi / cmpli: compare 'dest' with a 16-bit immediate; the CR field is left at 0.
+	void PPCXEmitter::CMPI(PPCReg dest, unsigned short imm) {
+		Write32((11<<26) | (dest << 16) | ((imm) & 0xffff));
+	}
+
+	void PPCXEmitter::CMPLI(PPCReg dest, unsigned short imm) {
+		Write32((10<<26) | (dest << 16) | ((imm) & 0xffff));
+	}
+
+	// NOTE(review): 'cr' is accepted but never encoded in CMP/CMPL, so the result
+	// always targets CR0 - confirm whether callers rely on other fields.
+	void PPCXEmitter::CMP(PPCReg a, PPCReg b, CONDITION_REGISTER cr) {
+		Write32((31 << 26) | (a << 16) | (b << 11));
+	}
+	void PPCXEmitter::CMPL(PPCReg a, PPCReg b, CONDITION_REGISTER cr) {
+		Write32((31 << 26) | (a << 16) | (b << 11) | (1<<6));
+	}
+	// mfcr dest
+	void PPCXEmitter::MFCR (PPCReg dest) {
+		Write32(0x7C000026 | (dest << 21));
+	}
+	// mtcrf 0xff, dest (all eight CR fields)
+	void PPCXEmitter::MTCR (PPCReg dest) {
+		Write32(0x7C000120 | (dest << 21) | (0xff<<12));
+	}
+
+	// cror bt, ba, bb
+	void PPCXEmitter::CROR (int bt, int ba, int bb) {
+		XL_FORM(19, bt, ba, bb, 449, 0);
+	}
+
+	void PPCXEmitter::ISEL (PPCReg Rt, PPCReg Ra, PPCReg Rb, CONDITION_REGISTER cr) {
+		// Not working !!
+		A_FORM(31, Rt, Ra, Rb, cr, 15, 0);
+		Break();
+	}
+
+	// Others operation
+	// For the logical ops the destination goes in the RA field (<< 16) and the
+	// first source in the RS field (<< 21).
+	void PPCXEmitter::ORI(PPCReg Rd, PPCReg Ra, unsigned short imm) {
+		u32 instr = (0x60000000 | (Ra << 21) | (Rd << 16) | (imm & 0xffff));
+		Write32(instr);
+	}
+	void PPCXEmitter::XORI (PPCReg Rdest, PPCReg Ra, unsigned short imm) {
+		u32 instr = (0x68000000 | (Ra << 21) | (Rdest << 16) | (imm & 0xffff));
+		Write32(instr);
+	}
+
+	void PPCXEmitter::OR(PPCReg Rdest, PPCReg Ra, PPCReg Rb) {
+		u32 instr = (0x7C000378 | (Ra << 21) | (Rdest << 16) | (Rb << 11));
+		Write32(instr);
+	}
+
+	void PPCXEmitter::XOR(PPCReg Rd, PPCReg Ra, PPCReg Rb) {
+		u32 instr = (0x7C000278 | (Ra << 21) | (Rd << 16) | (Rb << 11));
+		Write32(instr);
+	}
+
+	// neg Rd, Ra
+	void PPCXEmitter::NEG(PPCReg Rd, PPCReg Ra) {
+		XO_FORM(31, Rd, Ra, 0, 0, 104, 0);
+	}
+
+
+	void PPCXEmitter::NOR(PPCReg Rd, PPCReg Ra, PPCReg Rb) {
+		u32 instr = (0x7C0000f8 | (Ra << 21) | (Rd << 16) | (Rb << 11));
+		Write32(instr);
+	}
+
+	// subf Rd, Ra, Rb; bit 0 of RCFlags becomes the Rc bit.
+	void PPCXEmitter::SUBF(PPCReg Rd, PPCReg Ra, PPCReg Rb, int RCFlags) {
+		u32 instr = (0x7C000050 | (Rd << 21) | (Ra << 16) | (Rb << 11) | (RCFlags & 1));
+		Write32(instr);
+	}
+
+	// subfc Rd, Ra, Rb
+	void PPCXEmitter::SUBFC (PPCReg Rd, PPCReg Ra, PPCReg Rb) {
+		XO_FORM(31, Rd, Ra, Rb, 0, 8, 0);
+	}
+	// subfic Rt, Ra, imm
+	void PPCXEmitter::SUBFIC(PPCReg Rt, PPCReg Ra, short imm) {
+		D_FORM(8, Rt, Ra, imm);
+	}
+
+
+	// subfe Rd, Ra, Rb
+	void PPCXEmitter::SUBFE(PPCReg Rd, PPCReg Ra, PPCReg Rb) {
+		XO_FORM(31, Rd, Ra, Rb, 0, 136, 0);
+	}
+
+	// Quick Call
+	// dest = LIS(imm) + ORI(+imm)
+	// Emits one LI when imm fits a sign-extended 16-bit value, otherwise LIS and,
+	// if the low halfword is non-zero, ORI. The emitted length therefore varies.
+	void PPCXEmitter::MOVI2R(PPCReg dest, unsigned int imm) {
+		if ((s32) (s16) (imm) == (s32) (imm)) {
+			// 16bit
+			LI(dest, imm & 0xFFFF);
+		} else {
+			// HI 16bit
+			LIS(dest, imm>>16);
+			if ((imm & 0xFFFF) != 0) {
+				// LO 16bit
+				ORI(dest, dest, imm & 0xFFFF);
+			}
+		}
+	}
+
+	// Calls 'func' through CTR. Clobbers R0 and CTR; the address is truncated to 32 bits.
+	void PPCXEmitter::QuickCallFunction(void *func) {
+		/** TODO : can use simple jump **/
+
+		u32 func_addr = (uintptr_t) func;
+		// Load func address
+		MOVI2R(R0, func_addr);
+		// Set it to the count register
+		MTCTR(R0);
+		// Branch
+		BCTRL();
+	}
+
+	// sign
+	// extsb dest, src
+	void PPCXEmitter::EXTSB (PPCReg dest, PPCReg src) {
+		Write32((0x7C000774 | (src << 21) | (dest << 16)));
+	}
+
+	// extsh dest, src
+	void PPCXEmitter::EXTSH (PPCReg dest, PPCReg src) {
+		Write32(0x7C000734 | (src << 21) | (dest << 16));
+	}
+#if PPSSPP_ARCH(64BIT)
+	void PPCXEmitter::EXTSW (PPCReg Rt, PPCReg Ra) {
+		X_FORM(31, Rt, Ra, 0, 986, 0);
+	}
+#endif
+	void PPCXEmitter::EQV (PPCReg Ra, PPCReg Rs, PPCReg Rb) {
+		X_FORM(31, Rs, Ra, Rb, 284, 0);
+	}
+
+	// rlwinm dest, src, shift, start, end. Callers must keep every field within 0..31.
+	void PPCXEmitter::RLWINM (PPCReg dest, PPCReg src, int shift, int start, int end) {
+		Write32((21<<26) | (src << 21) | (dest << 16) | (shift << 11) | (start << 6) | (end << 1));
+	}
+
+	// rldicl (MD-form). sh and mb are 6-bit values stored as split fields:
+	// sh[4:0] at bit 11 and sh[5] at bit 1; mb[4:0] at bit 6 and mb[5] at bit 5.
+	// Bits 2..4 hold the extended opcode (0 for rldicl); Rc is 0.
+	// The first parameter goes in the RS field and the second in the RA field.
+	void PPCXEmitter::RLDICL (PPCReg Rs, PPCReg Ra, int sh, int mb) {
+		Write32((30 << 26) | (Rs << 21) | (Ra << 16) | ((sh & 0x1f) << 11) | ((mb & 0x1f) << 6) | (((mb >> 5) & 1) << 5) | (0 << 2) | (((sh >> 5) & 1) << 1) | (0));
+	}
+
+	// Shift Instructions
+	void PPCXEmitter::SRAW (PPCReg dest, PPCReg src, PPCReg shift) {
+		X_FORM(31, src, dest, shift, 792, 0);
+	}
+	void PPCXEmitter::SRAWI (PPCReg dest, PPCReg src, unsigned short imm) {
+		X_FORM(31, src, dest, imm, 824, 0);
+	}
+
+	void PPCXEmitter::SLW (PPCReg dest, PPCReg src, PPCReg shift) {
+		X_FORM(31, src, dest, shift, 24, 0);
+	}
+
+	// slwi dest, src, imm == rlwinm dest, src, imm, 0, 31 - imm
+	void PPCXEmitter::SLWI (PPCReg dest, PPCReg src, unsigned short imm) {
+		RLWINM(dest, src, imm, 0, (31-imm));
+	}
+
+	void PPCXEmitter::SRW (PPCReg dest, PPCReg src, PPCReg shift) {
+		X_FORM(31, src, dest, shift, 536, 0);
+	}
+
+	// srwi dest, src, imm == rlwinm dest, src, 32 - imm, imm, 31.
+	// The rotate count is masked so imm == 0 encodes a rotate of 0 instead of
+	// overflowing the 5-bit SH field.
+	void PPCXEmitter::SRWI (PPCReg dest, PPCReg src, unsigned short imm) {
+		RLWINM(dest, src, (32-imm) & 31, imm, 31);
+	}
+
+	void PPCXEmitter::ROTRW (PPCReg dest, PPCReg src, PPCReg shift) {
+		// Not implemented: a right rotate by register needs a scratch register for the
+		// negated count. Trap like the other unimplemented ops instead of emitting nothing.
+		Break();
+	}
+
+	// rotrwi dest, src, imm == rlwinm dest, src, 32 - imm, 0, 31 (count masked as in SRWI).
+	void PPCXEmitter::ROTRWI(PPCReg dest, PPCReg src, unsigned short imm) {
+		RLWINM(dest, src, (32-imm) & 31, 0, 31);
+	}
+
+	// rotlw dest, src, shift == rlwnm dest, src, shift, 0, 31
+	void PPCXEmitter::ROTLW (PPCReg dest, PPCReg src, PPCReg shift) {
+		Write32((23<<26) | (src << 21) | (dest << 16) | (shift << 11) | (0 << 6) | (31 << 1));
+	}
+
+	// rotlwi dest, src, imm == rlwinm dest, src, imm, 0, 31
+	void PPCXEmitter::ROTLWI (PPCReg dest, PPCReg src, unsigned short imm) {
+		RLWINM(dest, src, imm & 31, 0, 31);
+	}
+
+	// Fpu
+	// lfs / lfd / stfs / stfd FRt, offset(Ra)
+	void PPCXEmitter::LFS (PPCReg FRt, PPCReg Ra, unsigned short offset) {
+		D_FORM(48, FRt, Ra, offset);
+	}
+	void PPCXEmitter::LFD (PPCReg FRt, PPCReg Ra, unsigned short offset) {
+		D_FORM(50, FRt, Ra, offset);
+	}
+	void PPCXEmitter::SFS (PPCReg FRt, PPCReg Ra, unsigned short offset) {
+		D_FORM(52, FRt, Ra, offset);
+	}
+	void PPCXEmitter::SFD (PPCReg FRt, PPCReg Ra, unsigned short offset) {
+		D_FORM(54, FRt, Ra, offset);
+	}
+
+
+	// Loads a float constant into FPR 'dest' by bouncing its bit pattern through a
+	// function-local static word. Clobbers R6 and R7; the generated code shares that
+	// one static slot between all call sites.
+	void PPCXEmitter::MOVI2F (PPCReg dest, float imm, bool negate) {
+		static u32 tmp;
+
+		union convert {
+			unsigned int i;
+			float f;
+		} fc;
+
+		fc.f = imm;
+
+		MOVI2R(R6, fc.i);
+
+		// R7 = &tmp
+		MOVI2R(R7, (uintptr_t)&tmp);
+		STW(R6, R7);
+
+		// dest = *R7
+		LFS(dest, R7, 0);
+
+		if (negate == true) {
+			FNEG(dest, dest);
+		}
+	}
+
+	// Stores FRt byte-swapped at Base + offset, via a static word. Clobbers R6 and R7.
+	void PPCXEmitter::SaveFloatSwap(PPCReg FRt, PPCReg Base, PPCReg offset) {
+		// used for swapping float ...
+		// TODO: maybe just use the stack ?
+		static u32 tmp;
+
+		// Save Value in tmp
+		MOVI2R(R7, (uintptr_t)&tmp);
+		SFS(FRt, R7, 0);
+
+		// Load the value in R6
+		LWZ(R6, R7);
+
+		// Save the final value
+		STWBRX(R6, Base, offset);
+	}
+
+	// Loads a byte-swapped float from Base + offset into FRt, via a static word.
+	// Clobbers R6 and R7.
+	void PPCXEmitter::LoadFloatSwap(PPCReg FRt, PPCReg Base, PPCReg offset) {
+		// used for swapping float ...
+		static u32 tmp;
+
+		// Load Value into a temp REG
+		LWBRX(R6, Base, offset);
+
+		// Save it in tmp
+		MOVI2R(R7, (uintptr_t)&tmp);
+		STW(R6, R7);
+
+		// Load the final value
+		LFS(FRt, R7, 0);
+	}
+	// mtfsb0 bt
+	void PPCXEmitter::MTFSB0(int bt) {
+		X_FORM(63, bt, 0, 0, 70, 0);
+	}
+#if PPSSPP_ARCH(64BIT)
+	// fctid: extended opcode 814 (it previously reused fcfid's 846).
+	void PPCXEmitter::FCTID (PPCReg FRt, PPCReg FRb) {
+		X_FORM(63, FRt, 0, FRb, 814, 0);
+	}
+	// fcfid: extended opcode 846.
+	void PPCXEmitter::FCFID (PPCReg FRt, PPCReg FRb) {
+		X_FORM(63, FRt, 0, FRb, 846, 0);
+	}
+#endif
+	void PPCXEmitter::FRSP (PPCReg FRt, PPCReg FRb) {
+		X_FORM(63, FRt, 0, FRb, 12, 0);
+	}
+	void PPCXEmitter::FCTIW (PPCReg FRt, PPCReg FRb) {
+		X_FORM(63, FRt, 0, FRb, 14, 0);
+	}
+	void PPCXEmitter::STFIWX(PPCReg FRt, PPCReg FRa, PPCReg FRb) {
+		X_FORM(31, FRt, FRa, FRb, 983, 0);
+	}
+
+	// Fpu move instruction
+	void PPCXEmitter::FMR (PPCReg FRt, PPCReg FRb) {
+		X_FORM(63, FRt, 0, FRb, 72, 0);
+	}
+	void PPCXEmitter::FNEG (PPCReg FRt, PPCReg FRb) {
+		X_FORM(63, FRt, 0, FRb, 40, 0);
+	}
+	void PPCXEmitter::FABS (PPCReg FRt, PPCReg FRb) {
+		X_FORM(63, FRt, 0, FRb, 264, 0);
+	}
+	// FNABS and FCPSGN emit a trap ahead of the instruction.
+	void PPCXEmitter::FNABS (PPCReg FRt, PPCReg FRb) {
+		Break();
+		X_FORM(63, FRt, 0, FRb, 136, 0);
+	}
+	void PPCXEmitter::FCPSGN (PPCReg FRt, PPCReg FRb) {
+		Break();
+		X_FORM(63, FRt, 0, FRb, 8, 0);
+	}
+
+	// Fpu arith
+	// Primary opcode 59 is used for the single-precision ("S") variants, 63 for the rest.
+	void PPCXEmitter::FADDS (PPCReg FRt, PPCReg FRa, PPCReg FRb) {
+		A_FORM(59, FRt, FRa, FRb, 0, 21, 0);
+	}
+	void PPCXEmitter::FSUBS (PPCReg FRt, PPCReg FRa, PPCReg FRb) {
+		A_FORM(59, FRt, FRa, FRb, 0, 20, 0);
+	}
+	void PPCXEmitter::FADD (PPCReg FRt, PPCReg FRa, PPCReg FRb) {
+		A_FORM(63, FRt, FRa, FRb, 0, 21, 0);
+	}
+	void PPCXEmitter::FSUB (PPCReg FRt, PPCReg FRa, PPCReg FRb) {
+		A_FORM(63, FRt, FRa, FRb, 0, 20, 0);
+	}
+	void PPCXEmitter::FMUL (PPCReg FRt, PPCReg FRa, PPCReg FRc) {
+		A_FORM(63, FRt, FRa, 0, FRc, 25, 0);
+	}
+	void PPCXEmitter::FMULS (PPCReg FRt, PPCReg FRa, PPCReg FRc) {
+		A_FORM(59, FRt, FRa, 0, FRc, 25, 0);
+	}
+	void PPCXEmitter::FDIV (PPCReg FRt, PPCReg FRa, PPCReg FRb) {
+		A_FORM(63, FRt, FRa, FRb, 0, 18, 0);
+	}
+	void PPCXEmitter::FDIVS (PPCReg FRt, PPCReg FRa, PPCReg FRb) {
+		A_FORM(59, FRt, FRa, FRb, 0, 18, 0);
+	}
+#if !PPSSPP_ARCH(PPC750)
+	void PPCXEmitter::FSQRT (PPCReg FRt, PPCReg FRb) {
+		A_FORM(63, FRt, 0, FRb, 0, 22, 0);
+	}
+	void PPCXEmitter::FSQRTS (PPCReg FRt, PPCReg FRb) {
+		A_FORM(59, FRt, 0, FRb, 0, 22, 0);
+	}
+	// Not implemented: emits a trap.
+	void PPCXEmitter::FSQRTE (PPCReg FRt, PPCReg FRb) {
+		Break();
+	}
+	// Not implemented: emits a trap.
+	void PPCXEmitter::FSQRTES(PPCReg FRt, PPCReg FRb) {
+		Break();
+	}
+#endif
+	// Not implemented: emits a trap.
+	void PPCXEmitter::FRE (PPCReg FRt, PPCReg FRb) {
+		Break();
+	}
+	// Not implemented: emits a trap.
+	void PPCXEmitter::FRES (PPCReg FRt, PPCReg FRb) {
+		Break();
+	}
+
+	// Fpu mul add
+	void PPCXEmitter::FMADD (PPCReg FRt, PPCReg FRa, PPCReg FRc, PPCReg FRb) {
+		A_FORM(63, FRt, FRa, FRb, FRc, 29, 0);
+	}
+	void PPCXEmitter::FMSUB (PPCReg FRt, PPCReg FRa, PPCReg FRc, PPCReg FRb) {
+		A_FORM(63, FRt, FRa, FRb, FRc, 28, 0);
+	}
+	void PPCXEmitter::FMADDS (PPCReg FRt, PPCReg FRa, PPCReg FRc, PPCReg FRb) {
+		A_FORM(59, FRt, FRa, FRb, FRc, 29, 0);
+	}
+	void PPCXEmitter::FMSUBS (PPCReg FRt, PPCReg FRa, PPCReg FRc, PPCReg FRb) {
+		A_FORM(59, FRt, FRa, FRb, FRc, 28, 0);
+	}
+
+	// Fpu sel
+	void PPCXEmitter::FSEL (PPCReg FRt, PPCReg FRa, PPCReg FRc, PPCReg FRb) {
+		A_FORM(63, FRt, FRa, FRb, FRc, 23, 0);
+	}
+	// #define fpmin(a,b) __fsel((a)-(b), b,a)
+	// Clobbers FPR3 as scratch.
+	void PPCXEmitter::FMIN (PPCReg FRt, PPCReg FRa, PPCReg FRb) {
+		PPCReg safe = FPR3; // hope it's safe !!
+		FSUBS(safe, FRa, FRb);
+		FSEL(FRt, safe, FRb, FRa);
+		//Break();
+	}
+	// #define fpmax(a,b) __fsel((a)-(b), a,b)
+	// Clobbers FPR3 as scratch.
+	void PPCXEmitter::FMAX (PPCReg FRt, PPCReg FRa, PPCReg FRb) {
+		PPCReg safe = FPR3; // hope it's safe !!
+		FSUBS(safe, FRa, FRb);
+		FSEL(FRt, safe, FRa, FRb);
+		//Break();
+	}
+
+
+
+	// NOTE(review): Bf is passed straight into the 5-bit first field of X_FORM; the
+	// architectural BF field is the upper 3 bits of that slot, so values other than 0
+	// may need (Bf << 2) - confirm against X_FORM and the callers.
+	void PPCXEmitter::FCMPU (int Bf, PPCReg FRa, PPCReg FRb) { // unordered
+		X_FORM(63, Bf, FRa, FRb, 0, 0);
+	}
+
+	void PPCXEmitter::FCMPO (int Bf, PPCReg FRa, PPCReg FRb) { // ordered
+		X_FORM(63, Bf, FRa, FRb, 32, 0);
+	}
+
+	// fpu convert
+	// Extended opcodes: frin 392, friz 424, frip 456, frim 488
+	// (friz and frip were previously swapped).
+	void PPCXEmitter::FRIN (PPCReg FRt, PPCReg FRb) { // round
+		X_FORM(63, FRt, 0, FRb, 392, 0);
+	}
+	void PPCXEmitter::FRIZ (PPCReg FRt, PPCReg FRb) { // trunc
+		X_FORM(63, FRt, 0, FRb, 424, 0);
+	}
+	void PPCXEmitter::FRIP (PPCReg FRt, PPCReg FRb) { // ceil
+		X_FORM(63, FRt, 0, FRb, 456, 0);
+	}
+	void PPCXEmitter::FRIM (PPCReg FRt, PPCReg FRb) { // floor
+		X_FORM(63, FRt, 0, FRb, 488, 0);
+	}
+
+	// Prologue / epilogue
+
+	/** save/load fpr in a static buffer ... **/
+	static double _fprTmp[32];
+
+	// Emits the entry sequence: copies LR to R12, stores R14..R31 and R12 below the
+	// incoming stack pointer, allocates a 0x1F0-byte frame with stwu, then stores
+	// FPR14..FPR31 into the static _fprTmp buffer (clobbering R5).
+	// NOTE(review): R12 (LR) and the frame are written with 32-bit STW/STWU even when
+	// the GPRs use STD - confirm this is intended for 64-bit builds.
+	void PPCXEmitter::Prologue() {
+		// Save regs
+#if PPSSPP_ARCH(32BIT)
+		u32 regSize = 4;
+#else
+		u32 regSize = 8;
+#endif
+		u32 stackFrameSize = 0x1F0;
+
+		// Write Prologue (setup stack frame etc ...)
+		// Save Lr
+		MFLR(R12);
+
+		// Save gpr
+		for(int i = 14; i < 32; i ++) {
+#if PPSSPP_ARCH(32BIT)
+			STW((PPCReg)i, R1, -((33 - i) * regSize));
+#else
+			STD((PPCReg)i, R1, -((33 - i) * regSize));
+#endif
+		}
+
+		// Save r12
+		STW(R12, R1, -regSize);
+#if 0
+		// add fpr frame
+		ADDI(R12, R1, -0x98);
+
+		// Save fpr
+		for(int i = 14; i < 32; i ++) {
+			SFD((PPCReg)i, R1, -((32 - i) * sizeof(double)));
+		}
+#endif
+		// allocate stack
+		STWU(R1, R1, -stackFrameSize);
+
+#if 1
+		// load fpr buff
+		MOVI2R(R5, (uintptr_t)&_fprTmp);
+
+		// Save fpr
+		for(int i = 14; i < 32; i ++) {
+			SFD((PPCReg)i, R5, i * sizeof(double));
+		}
+#endif
+	}
+
+	// Emits the exit sequence mirroring Prologue(): releases the frame, reloads
+	// R14..R31, restores LR through R12 and reloads FPR14..FPR31 from _fprTmp
+	// (clobbering R5). It does not emit the return instruction itself.
+	void PPCXEmitter::Epilogue() {
+#if PPSSPP_ARCH(32BIT)
+		u32 regSize = 4;
+#else
+		u32 regSize = 8;
+#endif
+		u32 stackFrameSize = 0x1F0;
+
+		//Break();
+
+		// Write Epilogue (restore stack frame, return)
+		// free stack
+		ADDI(R1, R1, stackFrameSize);
+#if 0
+		ADDI(R12, R1, -0x98);
+
+		// Restore fpr
+		for(int i = 14; i < 32; i ++) {
+			LFD((PPCReg)i, R1, -((32 - i) * sizeof(double)));
+		}
+#endif
+		// Restore gpr
+		for(int i = 14; i < 32; i ++) {
+#if PPSSPP_ARCH(32BIT)
+			LWZ((PPCReg)i, R1, -((33 - i) * regSize));
+#else
+			LD((PPCReg)i, R1, -((33 - i) * regSize));
+#endif
+		}
+
+		// recover r12 (LR saved register)
+		LWZ (R12, R1, -regSize);
+
+		// Restore Lr
+		MTLR(R12);
+
+#if 1
+		// load fpr buff
+		MOVI2R(R5, (uintptr_t)&_fprTmp);
+
+		// Load fpr
+		for(int i = 14; i < 32; i ++) {
+			LFD((PPCReg)i, R5, i * sizeof(double));
+		}
+#endif
+	}
+
+	// Others ...
+	// Points the emitter at a new buffer. 'writePtr' is accepted but not used here.
+	void PPCXEmitter::SetCodePointer(u8 *ptr, u8 *writePtr)
+	{
+		code = ptr;
+		startcode = code;
+		lastCacheFlushEnd = ptr;
+	}
+
+	// Returns the current emit position.
+	const u8 *PPCXEmitter::GetCodePointer() const
+	{
+		return code;
+	}
+
+	// Returns the current emit position as a writable pointer.
+	u8 *PPCXEmitter::GetWritableCodePtr()
+	{
+		return code;
+	}
+
+	// Pads with NOPs. Only whole instructions are written: bytes / 4, rounded down.
+	void PPCXEmitter::ReserveCodeSpace(u32 bytes)
+	{
+		for (u32 i = 0; i < bytes/4; i++)
+			Write32(0x60000000); //nop
+	}
+
+	// Pads with NOPs up to the next 16-byte boundary and returns the new position.
+	const u8 *PPCXEmitter::AlignCode16()
+	{
+		ReserveCodeSpace((-(intptr_t)code) & 15);
+		return code;
+	}
+
+	// Pads with NOPs up to the next 4096-byte boundary and returns the new position.
+	const u8 *PPCXEmitter::AlignCodePage()
+	{
+		ReserveCodeSpace((-(intptr_t)code) & 4095);
+		return code;
+	}
+
+	// Flushes everything emitted since the previous flush.
+	void PPCXEmitter::FlushIcache()
+	{
+		FlushIcacheSection(lastCacheFlushEnd, code);
+		lastCacheFlushEnd = code;
+	}
+
+	// Makes freshly written code in [start, end) visible to instruction fetch.
+	// Does nothing unless built for PowerPC.
+	void PPCXEmitter::FlushIcacheSection(u8 *start, u8 *end)
+	{
+#if PPSSPP_ARCH(POWERPC)
+#ifdef __wiiu__
+		DCStoreRange(start, end - start);
+		ICInvalidateRange(start, end - start);
+#if 0
+		DisassemblePPCRange(start, end, (void*)printf, (void*)OSGetSymbolName, 0);
+		fflush(stdout);
+#endif
+#elif defined(__GNUC__)
+		u8 * addr = start;
+		while(addr < end) {
+			__asm__ volatile ("dcbst 0, %0" : : "r"(addr) : "0", "memory");
+			__asm__ volatile ("icbi 0, %0" : : "r"(addr) : "0", "memory");
+			addr += 4;
+		}
+		__asm__ volatile (".long 0x7c0004ac");//sync
+		__asm__ volatile (".long 0x4C00012C");//isync
+#else
+		// 'addr' was previously used here without a declaration, so this branch
+		// could not compile.
+		u8 * addr = start;
+		while(addr < end) {
+			__asm dcbst r0, addr
+			__asm icbi r0, addr
+			addr += 4;
+		}
+		__emit(0x7c0004ac);//sync
+		__emit(0x4C00012C);//isync
+#endif
+#endif
+	}
+
+	// Always clear code space with breakpoints, so that if someone accidentally executes
+	// uninitialized, it just breaks into the debugger.
+	// NOTE(review): the fill byte is 0x00, not the Break() encoding - presumably an
+	// all-zero word traps as an illegal instruction; confirm.
+	void PPCXCodeBlock::PoisonMemory(int offset) {
+		memset(region + offset, 0x00, region_size - offset);
+	}
+
+} // namespace
diff --git a/Common/ppcEmitter.h b/Common/ppcEmitter.h
new file mode 100644
index 000000000000..0605b143efa4
--- /dev/null
+++ b/Common/ppcEmitter.h
@@ -0,0 +1,468 @@
+// Copyright (C) 2003 Dolphin Project.
+ +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +// http://www.csd.uwo.ca/~mburrel/stuff/ppc-asm.html +// http://publib.boulder.ibm.com/infocenter/pseries/v5r3/index.jsp?topic=/com.ibm.aix.aixassem/doc/alangref/linkage_convent.htm +// http://publib.boulder.ibm.com/infocenter/pseries/v5r3/index.jsp?topic=/com.ibm.aix.aixassem/doc/alangref/instruction_set.htm + +#pragma once + +#include "Common.h" +#include "CodeBlock.h" +#include "MemoryUtil.h" +#include +#include + +#undef _IP +#undef R0 +#undef _SP +#undef _LR +#undef _PC +#undef CALL +#undef _B + +namespace PpcGen +{ + enum PPCReg + { + // GPRs (32) + // Behaves as zero does in some instructions + R0 = 0, + // Stack pointer (SP) + R1, + // Reserved + R2, + // Used to pass integer function parameters and return values + R3, R4, + // Used to pass integer function parameters + R5, R6, R7, R8, R9, R10, + // General purpose + R11, + // Scratch + R12, + // Unused by the compiler reserved + R13, + // General purpose + R14, R15, R16, R17, R18, R19, + R20, R21, R22, R23, R24, R25, + R26, R27, R28, R29, R30, R31, + + // CRs (7) + CR0 = 0, + + // FPRs (32) + // Scratch + FPR0 = 0, + // Used to pass double word function parameters and return values + FPR1, FPR2, FPR3, FPR4, + FPR5, FPR6, FPR7, FPR8, + FPR9, FPR10, FPR11, FPR12, + FPR13, + // General purpose + FPR14, FPR15, FPR16, FPR17, + FPR18, FPR19, 
FPR20, FPR21, + FPR22, FPR23, FPR24, FPR25, + FPR26, FPR27, FPR28, FPR29, + FPR30, FPR31, + + + // Vmx (128) + VR0 = 0, VR1, VR2, VR3, VR4, + VR5, VR6, VR7, VR8, VR9, + VR10, VR11, VR12, VR13, VR14, + VR15, VR16, VR17, VR18, VR19, + VR20, VR21, VR22, VR23, VR24, + VR25, VR26, VR27, VR28, VR29, + VR30, VR31, VR32, VR33, VR34, + VR35, VR36, VR37, VR38, VR39, + VR40, VR41, VR42, VR43, VR44, + VR45, VR46, VR47, VR48, VR49, + VR50, VR51, VR52, VR53, VR54, + VR55, VR56, VR57, VR58, VR59, + VR60, VR61, VR62, VR63, VR64, + VR65, VR66, VR67, VR68, VR69, + VR70, VR71, VR72, VR73, VR74, + VR75, VR76, VR77, VR78, VR79, + VR80, VR81, VR82, VR83, VR84, + VR85, VR86, VR87, VR88, VR89, + VR90, VR91, VR92, VR93, VR94, + VR95, VR96, VR97, VR98, VR99, //... + + // Others regs + LR, CTR, XER, FPSCR, + + // End + + INVALID_REG = 0xFFFFFFFF + }; + enum IntegerSize + { + I_I8 = 0, + I_I16, + I_I32, + I_I64 + }; + + enum + { + NUMGPRs = 31, + }; + + typedef const u8* JumpTarget; + + + enum FixupBranchType { + _B, + _BEQ, + _BNE, + _BLT, + _BLE, + _BGT, + _BGE, + // Link register + _BL + }; + + struct FixupBranch + { + u8 *ptr; + u32 condition; // Remembers our codition at the time + FixupBranchType type; //0 = B 1 = BL + }; + + class PPCXEmitter + { + private: + u8 *code, *startcode; + u8 *lastCacheFlushEnd; + u32 condition; + + protected: + // Write opcode + inline void Write32(u32 value) {*(u32*)code = value; code+=4;} + public: + PPCXEmitter() : code(0), startcode(0), lastCacheFlushEnd(0) { + } + PPCXEmitter(u8 *code_ptr) { + code = code_ptr; + lastCacheFlushEnd = code_ptr; + startcode = code_ptr; + } + virtual ~PPCXEmitter() {} + + void SetCodePointer(u8 *ptr, u8 *writePtr); + void ReserveCodeSpace(u32 bytes); + const u8 *AlignCode16(); + const u8 *AlignCodePage(); + const u8 *GetCodePointer() const; + void FlushIcache(); + void FlushIcacheSection(u8 *start, u8 *end); + u8 *GetWritableCodePtr(); + + + // Special purpose instructions + + // Debug Breakpoint + void BKPT(u16 arg); + + // 
Hint instruction + void YIELD(); + + // Do nothing + void NOP(int count = 1); //nop padding - TODO: fast nop slides, for amd and intel (check their manuals) + + // FixupBranch ops + FixupBranch B(); + FixupBranch BL(); + FixupBranch BNE(); + FixupBranch BLT(); + FixupBranch BLE(); + + FixupBranch B_Cond(FixupBranchType type); + + void SetJumpTarget(FixupBranch const &branch); + + // Branch ops + void B (const void *fnptr); + void BL(const void *fnptr); + void BA (const void *fnptr); + void BLA(const void *fnptr); + void BEQ(const void *fnptr); + void BNE(const void *fnptr); + void BLE(const void *fnptr); + void BLT(const void *fnptr); + void BGT(const void *fnptr); + void BEQ (PPCReg r); + + void BLR(); + void BGTLR(); // ??? used ? + void BLTCTR(); + void BGTCTR(); + void BLECTR(); + void BGECTR(); + void BCTRL (); + void BCTR(); + + // Link Register + void MFLR(PPCReg r); + void MTLR(PPCReg r); + void MTCTR(PPCReg r); + + + // Logical Ops + void AND (PPCReg Rs, PPCReg Ra, PPCReg Rb); + void ANDI (PPCReg Rdest, PPCReg Ra, unsigned short imm); + void ANDIS(PPCReg Rdest, PPCReg Ra, unsigned short imm); + void NAND (PPCReg Rs, PPCReg Ra, PPCReg Rb); + void OR (PPCReg Rs, PPCReg Ra, PPCReg Rb); + void ORI (PPCReg Rdest, PPCReg Ra, unsigned short imm); + void NOR (PPCReg Rs, PPCReg Ra, PPCReg Rb); + void XOR (PPCReg Rs, PPCReg Ra, PPCReg Rb); + void XORI (PPCReg Rdest, PPCReg Ra, unsigned short imm); + void NEG (PPCReg Rs, PPCReg Ra); + void EQV (PPCReg a, PPCReg b, PPCReg c); + + // Arithmetics ops + void ADD (PPCReg Rd, PPCReg Ra, PPCReg Rb); + void ADDI (PPCReg Rd, PPCReg Ra, short imm); + void ADDIS (PPCReg Rd, PPCReg Ra, short imm); + void ADDC (PPCReg Rd, PPCReg Ra, PPCReg Rb); + void ADDZE (PPCReg Rd, PPCReg Ra); + void SUB (PPCReg Rd, PPCReg Ra, PPCReg Rb) { + // reverse ? 
+ SUBF(Rd, Rb, Ra); + } + // if RCFlags update CR0 + void SUBF (PPCReg Rd, PPCReg Ra, PPCReg Rb, int RCFlags = 0); + void SUBFIC (PPCReg Rt, PPCReg Ra, short imm); + void SUBFC (PPCReg Rd, PPCReg Ra, PPCReg Rb); + void SUBFE (PPCReg Rd, PPCReg Ra, PPCReg Rb); + + // integer multiplication ops + void DIVW (PPCReg Rt, PPCReg Ra, PPCReg Rb); + void DIVWU (PPCReg Rt, PPCReg Ra, PPCReg Rb); + void MULLW (PPCReg Rt, PPCReg Ra, PPCReg Rb); + void MULHW (PPCReg Rt, PPCReg Ra, PPCReg Rb); + void MULHWU (PPCReg Rt, PPCReg Ra, PPCReg Rb); + + // Memory load/store operations + void LI (PPCReg dest, unsigned short imm); + void LIS (PPCReg dest, unsigned short imm); + // dest = LIS(imm) + ORI(+imm) + void MOVI2R (PPCReg dest, unsigned int imm); + + // 8bit + void LBZ (PPCReg dest, PPCReg src, int offset = 0); + void LBZX (PPCReg dest, PPCReg a, PPCReg b); + + // 16bit + void LHZ (PPCReg dest, PPCReg src, int offset = 0); + void LHBRX (PPCReg dest, PPCReg src, PPCReg offset); + // 32 bit + void LWZ (PPCReg dest, PPCReg src, int offset = 0); + void LWBRX (PPCReg dest, PPCReg src, PPCReg offset); +#if PPSSPP_ARCH(64BIT) + // 64 bit + void LD (PPCReg dest, PPCReg src, int offset = 0); +#endif + + // 8 bit + void STB (PPCReg dest, PPCReg src, int offset = 0); + void STBX (PPCReg dest, PPCReg a, PPCReg b); + // 16 bit + void STH (PPCReg dest, PPCReg src, int offset = 0); + void STHBRX (PPCReg dest, PPCReg src, PPCReg offset); + // 32 bit + void STW (PPCReg dest, PPCReg src, int offset = 0); + void STWU (PPCReg dest, PPCReg src, int offset = 0); + void STWBRX (PPCReg dest, PPCReg src, PPCReg offset); +#if PPSSPP_ARCH(64BIT) + // 64 bit + void STD (PPCReg dest, PPCReg src, int offset = 0); +#endif + + // sign + void EXTSB (PPCReg dest, PPCReg src); + void EXTSH (PPCReg dest, PPCReg src); +#if PPSSPP_ARCH(64BIT) + void EXTSW (PPCReg dest, PPCReg src); +#endif + + // + void RLWINM (PPCReg dest, PPCReg src, int shift, int start, int end); + + void RLDICL (PPCReg Rt, PPCReg Rs, int sh, int 
mb); + + // Shift Instructions + void SRAW (PPCReg dest, PPCReg src, PPCReg shift); + void SRAWI (PPCReg dest, PPCReg src, unsigned short imm); + + void SLW (PPCReg dest, PPCReg src, PPCReg shift); + void SLWI (PPCReg dest, PPCReg src, unsigned short imm); + + void SRW (PPCReg dest, PPCReg src, PPCReg shift); + void SRWI (PPCReg dest, PPCReg src, unsigned short imm); + + void ROTRW (PPCReg dest, PPCReg src, PPCReg shift); + void ROTRWI (PPCReg dest, PPCReg src, unsigned short imm); + + void ROTLW (PPCReg dest, PPCReg src, PPCReg shift); + void ROTLWI (PPCReg dest, PPCReg src, unsigned short imm); + + // Compare + enum CONDITION_REGISTER{ + CR0, + CR1, + CR2, + CR3, + CR4, + CR5, + CR6, + CR7 + }; + + void CROR (int bt, int ba, int bb); + void CMPLI (PPCReg dest, unsigned short imm); + void CMPI (PPCReg dest, unsigned short imm); + void CMPL (PPCReg a, PPCReg b, CONDITION_REGISTER cr = CR0); + void CMP (PPCReg a, PPCReg b, CONDITION_REGISTER cr = CR0); + void MFCR (PPCReg dest); + void MTCR (PPCReg dest); + + void ISEL (PPCReg Rt, PPCReg Ra, PPCReg Rb, CONDITION_REGISTER cr = CR0); + + void Prologue(); + void Epilogue(); + + // Debug ! 
+ void Break() { + Write32(0x0FE00016); + } + + void MR (PPCReg to, PPCReg from) { + OR(to, from, from); + } + + // Fpu + void LFS (PPCReg FRt, PPCReg Ra, unsigned short offset = 0); + void LFD (PPCReg FRt, PPCReg Ra, unsigned short offset = 0); + void SFS (PPCReg FRt, PPCReg Ra, unsigned short offset = 0); + void SFD (PPCReg FRt, PPCReg Ra, unsigned short offset = 0); + void SaveFloatSwap(PPCReg FRt, PPCReg Ra, PPCReg offset); + void LoadFloatSwap(PPCReg FRt, PPCReg Ra, PPCReg offset); + // dest = LIS(imm) + ORI(+imm) + void MOVI2F (PPCReg dest, float imm, bool negate = false); + + // Fpu move instruction + void FMR (PPCReg FRt, PPCReg FRb); + + // fpu + void MTFSB0 (int bt); +#if PPSSPP_ARCH(64BIT) + void FCFID (PPCReg FRt, PPCReg FRb); + void FCTID (PPCReg FRt, PPCReg FRb); +#endif + void FRSP (PPCReg FRt, PPCReg FRb); + void FCTIW (PPCReg FRt, PPCReg FRb); + void STFIWX (PPCReg FRt, PPCReg FRa, PPCReg FRb); + + // Fpu + void FNEG (PPCReg FRt, PPCReg FRb); + void FABS (PPCReg FRt, PPCReg FRb); + void FNABS (PPCReg FRt, PPCReg FRb); + void FCPSGN (PPCReg FRt, PPCReg FRb); + + // Fpu arith + void FADD (PPCReg FRt, PPCReg FRa, PPCReg FRb); + void FSUB (PPCReg FRt, PPCReg FRa, PPCReg FRb); + void FADDS (PPCReg FRt, PPCReg FRa, PPCReg FRb); + void FSUBS (PPCReg FRt, PPCReg FRa, PPCReg FRb); + void FMUL (PPCReg FRt, PPCReg FRa, PPCReg FRc); + void FMULS (PPCReg FRt, PPCReg FRa, PPCReg FRc); + void FDIV (PPCReg FRt, PPCReg FRa, PPCReg FRb); + void FDIVS (PPCReg FRt, PPCReg FRa, PPCReg FRb); +#if !PPSSPP_ARCH(PPC750) + void FSQRT (PPCReg FRt, PPCReg FRb); + void FSQRTS (PPCReg FRt, PPCReg FRb); + void FSQRTE (PPCReg FRt, PPCReg FRb); + void FSQRTES(PPCReg FRt, PPCReg FRb); +#endif + void FRE (PPCReg FRt, PPCReg FRb); + void FRES (PPCReg FRt, PPCReg FRb); + + // FSEL ... 
+ void FSEL (PPCReg FRt, PPCReg FRa, PPCReg FRc, PPCReg FRb); + void FMIN (PPCReg FRt, PPCReg FRa, PPCReg FRb); + void FMAX (PPCReg FRt, PPCReg FRa, PPCReg FRb); + + // Fpu mul add + void FMADD (PPCReg FRt, PPCReg FRa, PPCReg FRc, PPCReg FRb); + void FMSUB (PPCReg FRt, PPCReg FRa, PPCReg FRc, PPCReg FRb); + void FMADDS (PPCReg FRt, PPCReg FRa, PPCReg FRc, PPCReg FRb); + void FMSUBS (PPCReg FRt, PPCReg FRa, PPCReg FRc, PPCReg FRb); + + // Fpu compare + void FCMPU (int Bf, PPCReg FRa, PPCReg FRb); // unordered + void FCMPO (int Bf, PPCReg FRa, PPCReg FRb); // ordered + + // Fpu convert + void FRIN (PPCReg FRt, PPCReg FRb); // round + void FRIZ (PPCReg FRt, PPCReg FRb); // trunc + void FRIP (PPCReg FRt, PPCReg FRb); // ceil + void FRIM (PPCReg FRt, PPCReg FRb); // floor + + + // VPU - lvx128 + void LoadVector(PPCReg Rd, PPCReg Ra, PPCReg Rb); + void SaveVector(PPCReg Rd, PPCReg Ra, PPCReg Rb); + void LoadVectorSwap(PPCReg Rd, PPCReg Ra, PPCReg Rb); + void SaveVectorSwap(PPCReg Rd, PPCReg Ra, PPCReg Rb); + + void MOVI2V (PPCReg dest, float imm); + + void VADDFP (PPCReg Rd, PPCReg Ra); // Vector Add Floating Point + void VMADDFP (PPCReg Rd, PPCReg Ra, PPCReg Rb); // Vector Multiply Add Floating Point + void VMAXFP (PPCReg Rd, PPCReg Ra); // Vector Maximum Floating Point + void VMINFP (PPCReg Rd, PPCReg Ra); // Vector Minimum Floating Point + void VMSUM3FP (PPCReg Rd, PPCReg Ra); // 3-operand Dot Product + void VMSUM4FP (PPCReg Rd, PPCReg Ra); // 4-operand Dot Product + void VMULFP (PPCReg Rd, PPCReg Ra); // Vector Multiply Floating Point + void VNMSUBFP (PPCReg Rd, PPCReg Ra, PPCReg Rb); // Vector Negate Multiply-Subtract Floating Point + void VSUBFP (PPCReg Rd, PPCReg Ra); // Vector Subtract Floating Point + + void VCMPBFP (PPCReg Rd, PPCReg Ra); // Vector Compare Bounds Floating Point + void VCMPEQFP (PPCReg Rd, PPCReg Ra); // Vector Compare Equal-to-Floating Point + void VCMPGEFP (PPCReg Rd, PPCReg Ra); // Vector Compare Greater-Than-or-Equal-to Floating Point + void 
VCMPGTFP (PPCReg Rd, PPCReg Ra); // Vector Compare Greater-Than Floating Point + + + + void QuickCallFunction(void *func); + protected: + + }; // class PPCXEmitter + + + // You get memory management for free, plus, you can use all the MOV etc functions without + // having to prefix them with gen-> or something similar. + class PPCXCodeBlock : public CodeBlock + { + private: + void PoisonMemory(int offset) override; + }; + +} // namespace diff --git a/Common/x64Emitter.cpp b/Common/x64Emitter.cpp index 82c790c36f77..0f7b5f0c175d 100644 --- a/Common/x64Emitter.cpp +++ b/Common/x64Emitter.cpp @@ -22,7 +22,9 @@ #include "CPUDetect.h" #include "MemoryUtil.h" +#ifndef PRIx64 #define PRIx64 "llx" +#endif namespace Gen { diff --git a/Core/Config.cpp b/Core/Config.cpp index 3164c9af9b5e..1dc11e6d646e 100644 --- a/Core/Config.cpp +++ b/Core/Config.cpp @@ -398,7 +398,7 @@ static int DefaultNumWorkers() { } static int DefaultCpuCore() { -#if PPSSPP_ARCH(ARM) || PPSSPP_ARCH(ARM64) || PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64) +#if PPSSPP_ARCH(ARM) || PPSSPP_ARCH(ARM64) || PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64) || PPSSPP_ARCH(POWERPC) return (int)CPUCore::JIT; #else return (int)CPUCore::INTERPRETER; @@ -682,7 +682,7 @@ bool Config::IsBackendEnabled(GPUBackend backend, bool validate) { if (backend == GPUBackend::OPENGL) return false; #endif -#if !PPSSPP_PLATFORM(IOS) +#if !PPSSPP_PLATFORM(IOS) && !PPSSPP_PLATFORM(WIIU) if (validate) { if (backend == GPUBackend::VULKAN && !VulkanMayBeAvailable()) return false; diff --git a/Core/ConfigValues.h b/Core/ConfigValues.h index 8936f638097e..095b053c0d88 100644 --- a/Core/ConfigValues.h +++ b/Core/ConfigValues.h @@ -56,6 +56,7 @@ enum class GPUBackend { DIRECT3D9 = 1, DIRECT3D11 = 2, VULKAN = 3, + GX2 = 4, }; inline std::string GPUBackendToString(GPUBackend backend) { diff --git a/Core/CoreParameter.h b/Core/CoreParameter.h index a4e542bf0280..7e71af22c823 100644 --- a/Core/CoreParameter.h +++ b/Core/CoreParameter.h @@ -29,6 +29,7 @@ enum 
GPUCore { GPUCORE_DIRECTX9, GPUCORE_DIRECTX11, GPUCORE_VULKAN, + GPUCORE_GX2, }; enum class FPSLimit { diff --git a/Core/Dialog/PSPNetconfDialog.h b/Core/Dialog/PSPNetconfDialog.h index e02ebae5a00f..eb6ed5d032dc 100644 --- a/Core/Dialog/PSPNetconfDialog.h +++ b/Core/Dialog/PSPNetconfDialog.h @@ -22,16 +22,16 @@ struct SceUtilityNetconfData { char groupName[8]; - int timeout; + s32_le timeout; }; struct SceUtilityNetconfParam { pspUtilityDialogCommon common; - int netAction; // sets how to connect + s32_le netAction; // sets how to connect PSPPointer NetconfData; - int netHotspot; // Flag to allow hotspot connections - int netHotspotConnected; // Flag to check if a hotspot connection is active - int netWifiSpot; // Flag to allow WIFI connections + s32_le netHotspot; // Flag to allow hotspot connections + s32_le netHotspotConnected; // Flag to check if a hotspot connection is active + s32_le netWifiSpot; // Flag to allow WIFI connections }; diff --git a/Core/Dialog/PSPOskDialog.h b/Core/Dialog/PSPOskDialog.h index a244fd419a92..896057822eb9 100644 --- a/Core/Dialog/PSPOskDialog.h +++ b/Core/Dialog/PSPOskDialog.h @@ -28,7 +28,7 @@ /** * Enumeration for input language */ -enum SceUtilityOskInputLanguage +enum SceUtilityOskInputLanguage : u32 { PSP_UTILITY_OSK_LANGUAGE_DEFAULT = 0x00, PSP_UTILITY_OSK_LANGUAGE_JAPANESE = 0x01, @@ -46,7 +46,7 @@ enum SceUtilityOskInputLanguage /** * Enumeration for OSK internal state */ -enum SceUtilityOskState +enum SceUtilityOskState : u32 { PSP_UTILITY_OSK_DIALOG_NONE = 0, /**< No OSK is currently active */ PSP_UTILITY_OSK_DIALOG_INITING = 1, /**< The OSK is currently being initialized */ @@ -59,7 +59,7 @@ enum SceUtilityOskState /** * Enumeration for OSK field results */ -enum SceUtilityOskResult +enum SceUtilityOskResult : u32 { PSP_UTILITY_OSK_RESULT_UNCHANGED = 0, PSP_UTILITY_OSK_RESULT_CANCELLED = 1, @@ -94,16 +94,6 @@ enum SceUtilityOskInputType PSP_UTILITY_OSK_INPUTTYPE_URL = 0x00080000 }; -#if COMMON_LITTLE_ENDIAN -typedef 
SceUtilityOskState SceUtilityOskState_le; -typedef SceUtilityOskInputLanguage SceUtilityOskInputLanguage_le; -typedef SceUtilityOskResult SceUtilityOskResult_le; -#else -typedef swap_struct_t > SceUtilityOskState_le; -typedef swap_struct_t > SceUtilityOskInputLanguage_le; -typedef swap_struct_t > SceUtilityOskResult_le; -#endif - /** * OSK Field data */ @@ -114,7 +104,7 @@ struct SceUtilityOskData /** Unknown. Pass 0. */ s32_le unk_04; /** One of ::SceUtilityOskInputLanguage */ - SceUtilityOskInputLanguage_le language; + LEndian language; /** Unknown. Pass 0. */ s32_le unk_12; /** One or more of ::SceUtilityOskInputType (types that are selectable by pressing SELECT) */ @@ -132,7 +122,7 @@ struct SceUtilityOskData /** Pointer to the output text */ PSPPointer outtext; /** Result. One of ::SceUtilityOskResult */ - SceUtilityOskResult_le result; + LEndian result; // Number of characters to allow, not including terminator (if less than outtextlength - 1.) u32_le outtextlimit; }; @@ -145,7 +135,7 @@ struct SceUtilityOskParams s32_le fieldCount; // Pointer to an array of fields (see SceUtilityOskData.) PSPPointer fields; - SceUtilityOskState_le state; + LEndian state; // Maybe just padding? 
s32_le unk_60; diff --git a/Core/Dialog/PSPScreenshotDialog.cpp b/Core/Dialog/PSPScreenshotDialog.cpp index acec8d08d449..f5a5c6c0a88b 100644 --- a/Core/Dialog/PSPScreenshotDialog.cpp +++ b/Core/Dialog/PSPScreenshotDialog.cpp @@ -23,7 +23,7 @@ #include "Core/MemMap.h" #include "Core/Reporting.h" -enum SceUtilityScreenshotType { +enum SceUtilityScreenshotType : u32 { SCE_UTILITY_SCREENSHOT_TYPE_GUI = 0, SCE_UTILITY_SCREENSHOT_TYPE_AUTO = 1, SCE_UTILITY_SCREENSHOT_TYPE_SAVE = 2, @@ -39,15 +39,9 @@ static const int SCE_UTILITY_SCREENSHOTDIALOG_SIZE_V1 = 436; static const int SCE_UTILITY_SCREENSHOTDIALOG_SIZE_V2 = 928; static const int SCE_UTILITY_SCREENSHOTDIALOG_SIZE_V3 = 932; -#if COMMON_LITTLE_ENDIAN -typedef SceUtilityScreenshotType SceUtilityScreenshotType_le; -#else -typedef swap_struct_t > SceUtilityScreenshotType_le; -#endif - struct SceUtilityScreenshotParams { pspUtilityDialogCommon base; - SceUtilityScreenshotType_le mode; + LEndian mode; // TODO }; diff --git a/Core/Dialog/SavedataParam.cpp b/Core/Dialog/SavedataParam.cpp index 37776d489e6d..ea5bce8fbf74 100644 --- a/Core/Dialog/SavedataParam.cpp +++ b/Core/Dialog/SavedataParam.cpp @@ -615,7 +615,7 @@ int SavedataParam::LoadSaveData(SceUtilitySavedataParam *param, const std::strin std::string filename = GetFileName(param); std::string filePath = dirPath + "/" + filename; s64 readSize; - INFO_LOG(SCEUTILITY,"Loading file with size %u in %s",param->dataBufSize,filePath.c_str()); + INFO_LOG(SCEUTILITY,"Loading file with size %u in %s",(u32)param->dataBufSize,filePath.c_str()); u8 *saveData = nullptr; int saveSize = -1; if (!ReadPSPFile(filePath, &saveData, saveSize, &readSize)) { diff --git a/Core/Dialog/SavedataParam.h b/Core/Dialog/SavedataParam.h index 857c69779baa..fac60542103d 100644 --- a/Core/Dialog/SavedataParam.h +++ b/Core/Dialog/SavedataParam.h @@ -31,7 +31,7 @@ class PPGeImage; struct PSPFileInfo; typedef u32_le SceSize_le; -enum SceUtilitySavedataType +enum SceUtilitySavedataType : u32 { 
SCE_UTILITY_SAVEDATA_TYPE_AUTOLOAD = 0, SCE_UTILITY_SAVEDATA_TYPE_AUTOSAVE = 1, @@ -84,7 +84,7 @@ static const char *const utilitySavedataTypeNames[] = { "GETSIZE", }; -enum SceUtilitySavedataFocus +enum SceUtilitySavedataFocus : u32 { SCE_UTILITY_SAVEDATA_FOCUS_NAME = 0, // specified by saveName[] SCE_UTILITY_SAVEDATA_FOCUS_FIRSTLIST = 1, // first listed (on screen or of all?) @@ -97,14 +97,6 @@ enum SceUtilitySavedataFocus SCE_UTILITY_SAVEDATA_FOCUS_LASTEMPTY = 8, // last empty (what if no empty?) }; -#if COMMON_LITTLE_ENDIAN -typedef SceUtilitySavedataType SceUtilitySavedataType_le; -typedef SceUtilitySavedataFocus SceUtilitySavedataFocus_le; -#else -typedef swap_struct_t > SceUtilitySavedataType_le; -typedef swap_struct_t > SceUtilitySavedataFocus_le; -#endif - typedef char SceUtilitySavedataSaveName[20]; // title, savedataTitle, detail: parts of the unencrypted SFO @@ -215,7 +207,7 @@ struct SceUtilitySavedataParam { pspUtilityDialogCommon common; - SceUtilitySavedataType_le mode; // 0 to load, 1 to save + LEndian mode; // 0 to load, 1 to save s32_le bind; s32_le overwriteMode; // use 0x10 ? 
@@ -244,7 +236,7 @@ struct SceUtilitySavedataParam PspUtilitySavedataFileData snd0FileData; PSPPointer newData; - SceUtilitySavedataFocus_le focus; + LEndian focus; s32_le abortStatus; // Function SCE_UTILITY_SAVEDATA_TYPE_SIZES diff --git a/Core/ELF/ElfReader.cpp b/Core/ELF/ElfReader.cpp index 942236c68a8a..77450526a7bf 100644 --- a/Core/ELF/ElfReader.cpp +++ b/Core/ELF/ElfReader.cpp @@ -357,7 +357,7 @@ void ElfReader::LoadRelocations2(int rel_seg) int ElfReader::LoadInto(u32 loadAddress, bool fromTop) { - DEBUG_LOG(LOADER,"String section: %i", header->e_shstrndx); + DEBUG_LOG(LOADER,"String section: %i", (int)header->e_shstrndx); if (header->e_ident[0] != ELFMAG0 || header->e_ident[1] != ELFMAG1 || header->e_ident[2] != ELFMAG2 || header->e_ident[3] != ELFMAG3) @@ -450,7 +450,7 @@ int ElfReader::LoadInto(u32 loadAddress, bool fromTop) DEBUG_LOG(LOADER,"Prerelocated executable"); } - DEBUG_LOG(LOADER,"%i segments:", header->e_phnum); + DEBUG_LOG(LOADER,"%i segments:", (int)header->e_phnum); // First pass : Get the damn bits into RAM u32 baseAddress = bRelocate ? 
vaddr : 0; @@ -482,7 +482,7 @@ int ElfReader::LoadInto(u32 loadAddress, bool fromTop) } memblock.ListBlocks(); - DEBUG_LOG(LOADER,"%i sections:", header->e_shnum); + DEBUG_LOG(LOADER,"%i sections:", (int)header->e_shnum); for (int i = 0; i < GetNumSections(); i++) { @@ -527,7 +527,7 @@ int ElfReader::LoadInto(u32 loadAddress, bool fromTop) Elf32_Rel *rels = (Elf32_Rel *)GetSectionDataPtr(i); - DEBUG_LOG(LOADER,"%s: Performing %i relocations on %s : offset = %08x", name, numRelocs, GetSectionName(sectionToModify), sections[i].sh_offset); + DEBUG_LOG(LOADER,"%s: Performing %i relocations on %s : offset = %08x", name, numRelocs, GetSectionName(sectionToModify), (u32)sections[i].sh_offset); if (!LoadRelocations(rels, numRelocs)) { WARN_LOG(LOADER, "LoadInto: Relocs failed, trying anyway"); } diff --git a/Core/ELF/ElfReader.h b/Core/ELF/ElfReader.h index 44d33a8e8eca..9a1ba62f7a24 100644 --- a/Core/ELF/ElfReader.h +++ b/Core/ELF/ElfReader.h @@ -49,7 +49,7 @@ class ElfReader { public: ElfReader(const void *ptr, size_t size) { base = (const char*)ptr; - base32 = (const u32 *)ptr; + base32 = (const u32_le *)ptr; header = (const Elf32_Ehdr*)ptr; segments = (const Elf32_Phdr *)(base + header->e_phoff); sections = (const Elf32_Shdr *)(base + header->e_shoff); @@ -139,7 +139,7 @@ class ElfReader { private: const char *base = nullptr; - const u32 *base32 = nullptr; + const u32_le *base32 = nullptr; const Elf32_Ehdr *header = nullptr; const Elf32_Phdr *segments = nullptr; const Elf32_Shdr *sections = nullptr; diff --git a/Core/ELF/ParamSFO.cpp b/Core/ELF/ParamSFO.cpp index b12b5089425a..1780ec37a3c9 100644 --- a/Core/ELF/ParamSFO.cpp +++ b/Core/ELF/ParamSFO.cpp @@ -21,25 +21,26 @@ #include "Common/CommonTypes.h" #include "Common/Log.h" #include "Common/StringUtils.h" +#include "Common/Swap.h" #include "Core/ELF/ParamSFO.h" #include "Core/Core.h" struct Header { - u32 magic; /* Always PSF */ - u32 version; /* Usually 1.1 */ - u32 key_table_start; /* Start position of key_table */ 
- u32 data_table_start; /* Start position of data_table */ - u32 index_table_entries; /* Number of entries in index_table*/ + u32_le magic; /* Always PSF */ + u32_le version; /* Usually 1.1 */ + u32_le key_table_start; /* Start position of key_table */ + u32_le data_table_start; /* Start position of data_table */ + u32_le index_table_entries; /* Number of entries in index_table*/ }; struct IndexTable { - u16 key_table_offset; /* Offset of the param_key from start of key_table */ - u16 param_fmt; /* Type of data of param_data in the data_table */ - u32 param_len; /* Used Bytes by param_data in the data_table */ - u32 param_max_len; /* Total bytes reserved for param_data in the data_table */ - u32 data_table_offset; /* Offset of the param_data from start of data_table */ + u16_le key_table_offset; /* Offset of the param_key from start of key_table */ + u16_le param_fmt; /* Type of data of param_data in the data_table */ + u32_le param_len; /* Used Bytes by param_data in the data_table */ + u32_le param_max_len; /* Total bytes reserved for param_data in the data_table */ + u32_le data_table_offset; /* Offset of the param_data from start of data_table */ }; void ParamSFOData::SetValue(std::string key, unsigned int value, int max_size) { @@ -113,7 +114,7 @@ bool ParamSFOData::ReadSFO(const u8 *paramsfo, size_t size) { case 0x0404: { // Unsigned int - const u32 *data = (const u32 *)(data_start + indexTables[i].data_table_offset); + const u32_le *data = (const u32_le *)(data_start + indexTables[i].data_table_offset); SetValue(key,*data,indexTables[i].param_max_len); VERBOSE_LOG(LOADER, "%s %08x", key, *data); } @@ -218,7 +219,7 @@ bool ParamSFOData::WriteSFO(u8 **paramsfo, size_t *size) { index_ptr->param_fmt = 0x0404; index_ptr->param_len = 4; - *(int*)data_ptr = it->second.i_value; + *(s32_le*)data_ptr = it->second.i_value; } else if (it->second.type == VT_UTF8_SPE) { diff --git a/Core/ELF/PrxDecrypter.cpp b/Core/ELF/PrxDecrypter.cpp index f87fa91fd753..3b3d28e59301 
100644 --- a/Core/ELF/PrxDecrypter.cpp +++ b/Core/ELF/PrxDecrypter.cpp @@ -152,119 +152,119 @@ static const u8 pauth_f7aa47f6_2[] = {0x3A, 0x6B, 0x48, 0x96, 0x86, 0xA5, 0xC8, static const u8 pauth_f7aa47f6_xor[] = {0xA9, 0x1E, 0xDD, 0x7B, 0x09, 0xBB, 0x22, 0xB5, 0x9D, 0xA3, 0x30, 0x69, 0x13, 0x6E, 0x0E, 0xD8}; // PRXDecrypter 144-byte tag keys. -static const u32 g_key0[] = { +static const u32_le g_key0[] = { 0x7b21f3be, 0x299c5e1d, 0x1c9c5e71, 0x96cb4645, 0x3c9b1be0, 0xeb85de3d, 0x4a7f2022, 0xc2206eaa, 0xd50b3265, 0x55770567, 0x3c080840, 0x981d55f2, 0x5fd8f6f3, 0xee8eb0c5, 0x944d8152, 0xf8278651, 0x2705bafa, 0x8420e533, 0x27154ae9, 0x4819aa32, 0x59a3aa40, 0x2cb3cf65, 0xf274466d, 0x3a655605, 0x21b0f88f, 0xc5b18d26, 0x64c19051, 0xd669c94e, 0xe87035f2, 0x9d3a5909, 0x6f4e7102, 0xdca946ce, 0x8416881b, 0xbab097a5, 0x249125c6, 0xb34c0872}; -static const u32 g_key2[] = { +static const u32_le g_key2[] = { 0xccfda932, 0x51c06f76, 0x046dcccf, 0x49e1821e, 0x7d3b024c, 0x9dda5865, 0xcc8c9825, 0xd1e97db5, 0x6874d8cb, 0x3471c987, 0x72edb3fc, 0x81c8365d, 0xe161e33a, 0xfc92db59, 0x2009b1ec, 0xb1a94ce4, 0x2f03696b, 0x87e236d8, 0x3b2b8ce9, 0x0305e784, 0xf9710883, 0xb039db39, 0x893bea37, 0xe74d6805, 0x2a5c38bd, 0xb08dc813, 0x15b32375, 0x46be4525, 0x0103fd90, 0xa90e87a2, 0x52aba66a, 0x85bf7b80, 0x45e8ce63, 0x4dd716d3, 0xf5e30d2d, 0xaf3ae456}; -static const u32 g_key3[] = { +static const u32_le g_key3[] = { 0xa6c8f5ca, 0x6d67c080, 0x924f4d3a, 0x047ca06a, 0x08640297, 0x4fd4a758, 0xbd685a87, 0x9b2701c2, 0x83b62a35, 0x726b533c, 0xe522fa0c, 0xc24b06b4, 0x459d1cac, 0xa8c5417b, 0x4fea62a2, 0x0615d742, 0x30628d09, 0xc44fab14, 0x69ff715e, 0xd2d8837d, 0xbeed0b8b, 0x1e6e57ae, 0x61e8c402, 0xbe367a06, 0x543f2b5e, 0xdb3ec058, 0xbe852075, 0x1e7e4dcc, 0x1564ea55, 0xec7825b4, 0xc0538cad, 0x70f72c7f, 0x49e8c3d0, 0xeda97ec5, 0xf492b0a4, 0xe05eb02a}; -static const u32 g_key44[] = { +static const u32_le g_key44[] = { 0xef80e005, 0x3a54689f, 0x43c99ccd, 0x1b7727be, 0x5cb80038, 0xdd2efe62, 0xf369f92c, 
0x160f94c5, 0x29560019, 0xbf3c10c5, 0xf2ce5566, 0xcea2c626, 0xb601816f, 0x64e7481e, 0x0c34debd, 0x98f29cb0, 0x3fc504d7, 0xc8fb39f0, 0x0221b3d8, 0x63f936a2, 0x9a3a4800, 0x6ecc32e3, 0x8e120cfd, 0xb0361623, 0xaee1e689, 0x745502eb, 0xe4a6c61c, 0x74f23eb4, 0xd7fa5813, 0xb01916eb, 0x12328457, 0xd2bc97d2, 0x646425d8, 0x328380a5, 0x43da8ab1, 0x4b122ac9}; -static const u32 g_key20[] = { +static const u32_le g_key20[] = { 0x33b50800, 0xf32f5fcd, 0x3c14881f, 0x6e8a2a95, 0x29feefd5, 0x1394eae3, 0xbd6bd443, 0x0821c083, 0xfab379d3, 0xe613e165, 0xf5a754d3, 0x108b2952, 0x0a4b1e15, 0x61eadeba, 0x557565df, 0x3b465301, 0xae54ecc3, 0x61423309, 0x70c9ff19, 0x5b0ae5ec, 0x989df126, 0x9d987a5f, 0x55bc750e, 0xc66eba27, 0x2de988e8, 0xf76600da, 0x0382dccb, 0x5569f5f2, 0x8e431262, 0x288fe3d3, 0x656f2187, 0x37d12e9c, 0x2f539eb4, 0xa492998e, 0xed3958f7, 0x39e96523}; -static const u32 g_key3A[] = { +static const u32_le g_key3A[] = { 0x67877069, 0x3abd5617, 0xc23ab1dc, 0xab57507d, 0x066a7f40, 0x24def9b9, 0x06f759e4, 0xdcf524b1, 0x13793e5e, 0x0359022d, 0xaae7e1a2, 0x76b9b2fa, 0x9a160340, 0x87822fba, 0x19e28fbb, 0x9e338a02, 0xd8007e9a, 0xea317af1, 0x630671de, 0x0b67ca7c, 0x865192af, 0xea3c3526, 0x2b448c8e, 0x8b599254, 0x4602e9cb, 0x4de16cda, 0xe164d5bb, 0x07ecd88e, 0x99ffe5f8, 0x768800c1, 0x53b091ed, 0x84047434, 0xb426dbbc, 0x36f948bb, 0x46142158, 0x749bb492}; -static const u32 g_keyEBOOT1xx[] = { +static const u32_le g_keyEBOOT1xx[] = { 0x18CB69EF, 0x158E8912, 0xDEF90EBB, 0x4CB0FB23, 0x3687EE18, 0x868D4A6E, 0x19B5C756, 0xEE16551D, 0xE7CB2D6C, 0x9747C660, 0xCE95143F, 0x2956F477, 0x03824ADE, 0x210C9DF1, 0x5029EB24, 0x81DFE69F, 0x39C89B00, 0xB00C8B91, 0xEF2DF9C2, 0xE13A93FC, 0x8B94A4A8, 0x491DD09D, 0x686A400D, 0xCED4C7E4, 0x96C8B7C9, 0x1EAADC28, 0xA4170B84, 0x505D5DDC, 0x5DA6C3CF, 0x0E5DFA2D, 0x6E7919B5, 0xCE5E29C7, 0xAAACDB94, 0x45F70CDD, 0x62A73725, 0xCCE6563D}; -static const u32 g_keyEBOOT2xx[] = { +static const u32_le g_keyEBOOT2xx[] = { 0xDA8E36FA, 0x5DD97447, 0x76C19874, 0x97E57EAF, 0x1CAB09BD, 
0x9835BAC6, 0x03D39281, 0x03B205CF, 0x2882E734, 0xE714F663, 0xB96E2775, 0xBD8AAFC7, 0x1DD3EC29, 0xECA4A16C, 0x5F69EC87, 0x85981E92, 0x7CFCAE21, 0xBAE9DD16, 0xE6A97804, 0x2EEE02FC, 0x61DF8A3D, 0xDD310564, 0x9697E149, 0xC2453F3B, 0xF91D8456, 0x39DA6BC8, 0xB3E5FEF5, 0x89C593A3, 0xFB5C8ABC, 0x6C0B7212, 0xE10DD3CB, 0x98D0B2A8, 0x5FD61847, 0xF0DC2357, 0x7701166A, 0x0F5C3B68}; -static const u32 g_demokeys_280[] = { +static const u32_le g_demokeys_280[] = { 0x2A5282B4, 0x8706DDA5, 0x4C88EC1C, 0xD504708E, 0x72634DD2, 0xDD2E2F60, 0xE3D5FDB5, 0xE050637D, 0x295C69AC, 0x7B61F57D, 0x594412B0, 0x13D925CE, 0x2A6BE8DD, 0xBC9594E6, 0x1F4A8A39, 0xC56B5909, 0x52CFB2F7, 0x03EE089F, 0x5CA57A21, 0xDB64090F, 0x5E9A56F3, 0x13C56633, 0xD9C48D1D, 0xCDA05972, 0xD09E13B2, 0x7DEDD3DF, 0x364387BB, 0xCB207488, 0xBEC14B3F, 0x7C9C0D11, 0x9916ED40, 0x65909519, 0xC55BB1B3, 0xE997E084, 0xB483438B, 0xB8A2D255}; -static const u32 g_keyUPDATER[] = { +static const u32_le g_keyUPDATER[] = { 0xA5603CBF, 0xD7482441, 0xF65764CC, 0x1F90060B, 0x4EA73E45, 0xE551D192, 0xE7B75D8A, 0x465A506E, 0x40FB1022, 0x2C273350, 0x8096DA44, 0x9947198E, 0x278DEE77, 0x745D062E, 0xC148FA45, 0x832582AF, 0x5FDB86DA, 0xCB15C4CE, 0x2524C62F, 0x6C2EC3B1, 0x369BE39E, 0xF7EB1FC4, 0x1E51CE1A, 0xD70536F4, 0xC34D39D8, 0x7418FB13, 0xE3C84DE1, 0xB118F03C, 0xA2018D4E, 0xE6D8770D, 0x5720F390, 0x17F96341, 0x60A4A68F, 0x1327DD28, 0x05944C64, 0x0C2C4C12}; -static const u32 g_keyMEIMG250[] = { +static const u32_le g_keyMEIMG250[] = { 0xA381FEBC, 0x99B9D5C9, 0x6C560A8D, 0x30309F95, 0x792646CC, 0x82B64E5E, 0x1A3951AD, 0x0A182EC4, 0xC46131B4, 0x77C50C8A, 0x325F16C6, 0x02D1942E, 0x0AA38AC4, 0x2A940AC6, 0x67034726, 0xE52DB133, 0xD2EF2107, 0x85C81E90, 0xC8D164BA, 0xC38DCE1D, 0x948BA275, 0x0DB84603, 0xE2473637, 0xCD74FCDA, 0x588E3D66, 0x6D28E822, 0x891E548B, 0xF53CF56D, 0x0BBDDB66, 0xC4B286AA, 0x2BEBBC4B, 0xFC261FF4, 0x92B8E705, 0xDCEE6952, 0x5E0442E5, 0x8BEB7F21}; -static const u32 g_keyMEIMG260[] = { +static const u32_le g_keyMEIMG260[] = { 0x11BFD698, 
0xD7F9B324, 0xDD524927, 0x16215B86, 0x504AC36D, 0x5843B217, 0xE5A0DA47, 0xBB73A1E7, 0x2915DB35, 0x375CFD3A, 0xBB70A905, 0x272BEFCA, 0x2E960791, 0xEA0799BB, 0xB85AE6C8, 0xC9CAF773, 0x250EE641, 0x06E74A9E, 0x5244895D, 0x466755A5, 0x9A84AF53, 0xE1024174, 0xEEBA031E, 0xED80B9CE, 0xBC315F72, 0x5821067F, 0xE8313058, 0xD2D0E706, 0xE6D8933E, 0xD7D17FB4, 0x505096C4, 0xFDA50B3B, 0x4635AE3D, 0xEB489C8A, 0x422D762D, 0x5A8B3231}; -static const u32 g_keyDEMOS27X[] = { +static const u32_le g_keyDEMOS27X[] = { 0x1ABF102F, 0xD596D071, 0x6FC552B2, 0xD4F2531F, 0xF025CDD9, 0xAF9AAF03, 0xE0CF57CF, 0x255494C4, 0x7003675E, 0x907BC884, 0x002D4EE4, 0x0B687A0D, 0x9E3AA44F, 0xF58FDA81, 0xEC26AC8C, 0x3AC9B49D, 0x3471C037, 0xB0F3834D, 0x10DC4411, 0xA232EA31, 0xE2E5FA6B, 0x45594B03, 0xE43A1C87, 0x31DAD9D1, 0x08CD7003, 0xFA9C2FDF, 0x5A891D25, 0x9B5C1934, 0x22F366E5, 0x5F084A32, 0x695516D5, 0x2245BE9F, 0x4F6DD705, 0xC4B8B8A1, 0xBC13A600, 0x77B7FC3B}; -static const u32 g_keyUNK1[] = { +static const u32_le g_keyUNK1[] = { 0x33B50800, 0xF32F5FCD, 0x3C14881F, 0x6E8A2A95, 0x29FEEFD5, 0x1394EAE3, 0xBD6BD443, 0x0821C083, 0xFAB379D3, 0xE613E165, 0xF5A754D3, 0x108B2952, 0x0A4B1E15, 0x61EADEBA, 0x557565DF, 0x3B465301, 0xAE54ECC3, 0x61423309, 0x70C9FF19, 0x5B0AE5EC, 0x989DF126, 0x9D987A5F, 0x55BC750E, 0xC66EBA27, 0x2DE988E8, 0xF76600DA, 0x0382DCCB, 0x5569F5F2, 0x8E431262, 0x288FE3D3, 0x656F2187, 0x37D12E9C, 0x2F539EB4, 0xA492998E, 0xED3958F7, 0x39E96523}; -static const u32 g_key_GAMESHARE1xx[] = { +static const u32_le g_key_GAMESHARE1xx[] = { 0x721B53E8, 0xFC3E31C6, 0xF85BA2A2, 0x3CF0AC72, 0x54EEA7AB, 0x5959BFCB, 0x54B8836B, 0xBC431313, 0x989EF2CF, 0xF0CE36B2, 0x98BA4CF8, 0xE971C931, 0xA0375DC8, 0x08E52FA0, 0xAC0DD426, 0x57E4D601, 0xC56E61C7, 0xEF1AB98A, 0xD1D9F8F4, 0x5FE9A708, 0x3EF09D07, 0xFA0C1A8C, 0xA91EEA5C, 0x58F482C5, 0x2C800302, 0x7EE6F6C3, 0xFF6ABBBB, 0x2110D0D0, 0xD3297A88, 0x980012D3, 0xDC59C87B, 0x7FDC5792, 0xDB3F5DA6, 0xFC23B787, 0x22698ED3, 0xB680E812}; -static const u32 g_key_GAMESHARE2xx[] = 
{ +static const u32_le g_key_GAMESHARE2xx[] = { 0x94A757C7, 0x9FD39833, 0xF8508371, 0x328B0B29, 0x2CBCB9DA, 0x2918B9C6, 0x944C50BA, 0xF1DCE7D0, 0x640C3966, 0xC90B3D08, 0xF4AD17BA, 0x6CA0F84B, 0xF7767C67, 0xA4D3A55A, 0x4A085C6A, 0x6BB27071, 0xFA8B38FB, 0x3FDB31B8, 0x8B7196F2, 0xDB9BED4A, 0x51625B84, 0x4C1481B4, 0xF684F508, 0x30B44770, 0x93AA8E74, 0x90C579BC, 0x246EC88D, 0x2E051202, 0xC774842E, 0xA185D997, 0x7A2B3ADD, 0xFE835B6D, 0x508F184D, 0xEB4C4F13, 0x0E1993D3, 0xBA96DFD2}; -static const u32 g_key_INDEXDAT1xx[] = { +static const u32_le g_key_INDEXDAT1xx[] = { 0x76CB00AF, 0x111CE62F, 0xB7B27E36, 0x6D8DE8F9, 0xD54BF16A, 0xD9E90373, 0x7599D982, 0x51F82B0E, 0x636103AD, 0x8E40BC35, 0x2F332C94, 0xF513AAE9, 0xD22AFEE9, 0x04343987, 0xFC5BB80C, 0x12349D89, 0x14A481BB, 0x25ED3AE8, @@ -275,7 +275,7 @@ static const u32 g_key_INDEXDAT1xx[] = { struct TAG_INFO { u32 tag; // 4 byte value at offset 0xD0 in the PRX file - const u32 *key; // "step1_result" use for XOR step + const u32_le *key; // "step1_result" use for XOR step u8 code; u8 codeExtra; }; @@ -709,7 +709,7 @@ static int pspDecryptType0(const u8 *inbuf, u8 *outbuf, u32 size) // no need to expand seed, and no need to decrypt // normally this would be a kirk7 op, but we have the seed pre-decrypted std::array xorbuf; - memcpy(xorbuf.data(), reinterpret_cast(pti->key), xorbuf.size()); + memcpy(xorbuf.data(), pti->key, xorbuf.size()); // construct the header format for a type 0 prx PRXType0 type0(inbuf); @@ -763,7 +763,7 @@ static int pspDecryptType1(const u8 *inbuf, u8 *outbuf, u32 size) // no need to expand seed, and no need to decrypt // normally this would be a kirk7 op, but we have the seed pre-decrypted std::array xorbuf; - memcpy(xorbuf.data(), reinterpret_cast(pti->key), xorbuf.size()); + memcpy(xorbuf.data(), pti->key, xorbuf.size()); // construct the header format for a type 1 prx PRXType1 type1(inbuf); @@ -858,7 +858,7 @@ static int pspDecryptType2(const u8 *inbuf, u8 *outbuf, u32 size) 
memcpy(reinterpret_cast(&header->data_size), type2.kirkMetadata, sizeof(type2.kirkMetadata)); memcpy(reinterpret_cast(header)+sizeof(KIRK_CMD1_HEADER), type2.prxHeader, sizeof(type2.prxHeader)); decryptKirkHeader(reinterpret_cast(header), type2.kirkHeader, xorbuf.cbegin()+0x10, pti->code); - header->mode = 1; + *(u32_le*)&header->mode = 1; if (kirk_sceUtilsBufferCopyWithRange(outbuf, size, reinterpret_cast(header), size - offset, KIRK_CMD_DECRYPT_PRIVATE) != 0) { @@ -922,7 +922,7 @@ static int pspDecryptType5(const u8 *inbuf, u8 *outbuf, u32 size, const u8 *seed memcpy(reinterpret_cast(&header->data_size), type5.kirkMetadata, sizeof(type5.kirkMetadata)); memcpy(reinterpret_cast(header)+sizeof(KIRK_CMD1_HEADER), type5.prxHeader, sizeof(type5.prxHeader)); decryptKirkHeader(reinterpret_cast(header), type5.kirkHeader, xorbuf.cbegin()+0x10, pti->code); - header->mode = 1; + *(u32_le*)&header->mode = 1; if (kirk_sceUtilsBufferCopyWithRange(outbuf, size, reinterpret_cast(header), size - offset, KIRK_CMD_DECRYPT_PRIVATE) != 0) { @@ -988,7 +988,7 @@ static int pspDecryptType6(const u8 *inbuf, u8 *outbuf, u32 size) memcpy(reinterpret_cast(&header->data_size), type6.kirkMetadata, sizeof(type6.kirkMetadata)); memcpy(reinterpret_cast(header)+sizeof(KIRK_CMD1_ECDSA_HEADER), type6.prxHeader, sizeof(type6.prxHeader)); decryptKirkHeader(reinterpret_cast(header), type6.kirkHeader, xorbuf.cbegin()+0x10, pti->code); - header->mode = 1; + *(u32_le*)&header->mode = 1; header->ecdsa_hash = 1; if (kirk_sceUtilsBufferCopyWithRange(outbuf, size, reinterpret_cast(header), size - offset, KIRK_CMD_DECRYPT_PRIVATE) != 0) diff --git a/Core/ELF/PrxDecrypter.h b/Core/ELF/PrxDecrypter.h index 24f20d41b570..15b62890f2e7 100644 --- a/Core/ELF/PrxDecrypter.h +++ b/Core/ELF/PrxDecrypter.h @@ -54,14 +54,11 @@ typedef struct u32_le key_data2; // 12C u32_le oe_tag; // 130 u8 key_data3[0x1C]; // 134 -#ifdef _MSC_VER } PSP_Header; -#else -} __attribute__((packed)) PSP_Header; -#endif - #ifdef _MSC_VER 
#pragma pack(pop) #endif +static_assert(sizeof(PSP_Header) == 0x150, "sizeof(PSP_Header) != 0x150"); + int pspDecryptPRX(const u8 *inbuf, u8 *outbuf, u32 size, const u8 *seed = nullptr); diff --git a/Core/FileLoaders/LocalFileLoader.cpp b/Core/FileLoaders/LocalFileLoader.cpp index 8e757b331545..03a2162e8818 100644 --- a/Core/FileLoaders/LocalFileLoader.cpp +++ b/Core/FileLoaders/LocalFileLoader.cpp @@ -29,6 +29,10 @@ #include #endif +#ifndef O_CLOEXEC +#define O_CLOEXEC 0 +#endif + LocalFileLoader::LocalFileLoader(const std::string &filename) : filesize_(0), filename_(filename) { if (filename.empty()) { @@ -136,6 +140,10 @@ size_t LocalFileLoader::ReadAt(s64 absolutePos, size_t bytes, size_t count, void lseek64(fd_, absolutePos, SEEK_SET); return read(fd_, data, bytes * count) / bytes; } +#elif defined(__wiiu__) + std::lock_guard guard(readLock_); + lseek(fd_, absolutePos, SEEK_SET); + return read(fd_, data, bytes * count) / bytes; #elif !defined(_WIN32) #if defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS < 64 return pread64(fd_, data, bytes * count, absolutePos) / bytes; diff --git a/Core/FileSystems/BlockDevices.cpp b/Core/FileSystems/BlockDevices.cpp index b8f2b4432b11..2059c94ba0eb 100644 --- a/Core/FileSystems/BlockDevices.cpp +++ b/Core/FileSystems/BlockDevices.cpp @@ -169,7 +169,7 @@ CISOFileBlockDevice::CISOFileBlockDevice(FileLoader *fileLoader) const u32 indexSize = numFrames + 1; const size_t headerEnd = hdr.ver > 1 ? 
(size_t)hdr.header_size : sizeof(hdr); -#if COMMON_LITTLE_ENDIAN +#if __LITTLE_ENDIAN__ index = new u32[indexSize]; if (fileLoader->ReadAt(headerEnd, sizeof(u32), indexSize, index) != indexSize) { NotifyReadError(); @@ -386,6 +386,7 @@ NPDRMDemoBlockDevice::NPDRMDemoBlockDevice(FileLoader *fileLoader) u32 lbaStart, lbaEnd; fileLoader_->ReadAt(0x24, 1, 4, &psarOffset); + psarOffset = *(u32_le*)&psarOffset; size_t readSize = fileLoader_->ReadAt(psarOffset, 1, 256, &np_header); if(readSize!=256){ ERROR_LOG(LOADER, "Invalid NPUMDIMG header!"); @@ -404,17 +405,17 @@ NPDRMDemoBlockDevice::NPDRMDemoBlockDevice(FileLoader *fileLoader) sceDrmBBCipherUpdate(&ckey, np_header+0x40, 0x60); sceDrmBBCipherFinal(&ckey); - lbaStart = *(u32*)(np_header+0x54); // LBA start - lbaEnd = *(u32*)(np_header+0x64); // LBA end + lbaStart = *(u32_le*)(np_header+0x54); // LBA start + lbaEnd = *(u32_le*)(np_header+0x64); // LBA end lbaSize = (lbaEnd-lbaStart+1); // LBA size of ISO - blockLBAs = *(u32*)(np_header+0x0c); // block size in LBA + blockLBAs = *(u32_le*)(np_header+0x0c); // block size in LBA blockSize = blockLBAs*2048; numBlocks = (lbaSize+blockLBAs-1)/blockLBAs; // total blocks; blockBuf = new u8[blockSize]; tempBuf = new u8[blockSize]; - tableOffset = *(u32*)(np_header+0x6c); // table offset + tableOffset = *(u32_le*)(np_header+0x6c); // table offset tableSize = numBlocks*32; table = new table_info[numBlocks]; diff --git a/Core/FileSystems/BlockDevices.h b/Core/FileSystems/BlockDevices.h index 2db068c77716..cfeefc64936a 100644 --- a/Core/FileSystems/BlockDevices.h +++ b/Core/FileSystems/BlockDevices.h @@ -97,10 +97,10 @@ class FileBlockDevice : public BlockDevice { struct table_info { u8 mac[16]; - u32 offset; - int size; - int flag; - int unk_1c; + u32_le offset; + s32_le size; + s32_le flag; + s32_le unk_1c; }; class NPDRMDemoBlockDevice : public BlockDevice { diff --git a/Core/FileSystems/FileSystem.h b/Core/FileSystems/FileSystem.h index 6c069efe0df5..2486b45fa477 100644 --- 
a/Core/FileSystems/FileSystem.h +++ b/Core/FileSystems/FileSystem.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "base/basictypes.h" #include "Core/HLE/sceKernel.h" diff --git a/Core/FileSystems/ISOFileSystem.cpp b/Core/FileSystems/ISOFileSystem.cpp index 4484ac811024..dc72d63524a0 100644 --- a/Core/FileSystems/ISOFileSystem.cpp +++ b/Core/FileSystems/ISOFileSystem.cpp @@ -85,7 +85,7 @@ struct DirectoryEntry { u8 identifierLength; //identifier comes right after u8 firstIdChar; -#if COMMON_LITTLE_ENDIAN +#if __LITTLE_ENDIAN__ u32 firstDataSector() const { return firstDataSectorLE; diff --git a/Core/FileSystems/VirtualDiscFileSystem.cpp b/Core/FileSystems/VirtualDiscFileSystem.cpp index df3375fffbe0..ba2dadef7e9e 100644 --- a/Core/FileSystems/VirtualDiscFileSystem.cpp +++ b/Core/FileSystems/VirtualDiscFileSystem.cpp @@ -36,7 +36,7 @@ #include #include #include -#if !PPSSPP_PLATFORM(SWITCH) +#if !PPSSPP_PLATFORM(SWITCH) && !PPSSPP_PLATFORM(WIIU) #include #endif #endif @@ -815,7 +815,7 @@ void VirtualDiscFileSystem::HandlerLogger(void *arg, HandlerHandle handle, LogTy } VirtualDiscFileSystem::Handler::Handler(const char *filename, VirtualDiscFileSystem *const sys) { -#if !PPSSPP_PLATFORM(SWITCH) +#if !PPSSPP_PLATFORM(SWITCH) && !PPSSPP_PLATFORM(WIIU) #ifdef _WIN32 #if PPSSPP_PLATFORM(UWP) #define dlopen(name, ignore) (void *)LoadPackagedLibrary(ConvertUTF8ToWString(name).c_str(), 0) @@ -861,7 +861,7 @@ VirtualDiscFileSystem::Handler::~Handler() { if (library != NULL) { Shutdown(); -#if !PPSSPP_PLATFORM(UWP) && !PPSSPP_PLATFORM(SWITCH) +#if !PPSSPP_PLATFORM(UWP) && !PPSSPP_PLATFORM(SWITCH) && !PPSSPP_PLATFORM(WIIU) #ifdef _WIN32 FreeLibrary((HMODULE)library); #else diff --git a/Core/Font/PGF.cpp b/Core/Font/PGF.cpp index ae3afa57b848..b71179f5acce 100644 --- a/Core/Font/PGF.cpp +++ b/Core/Font/PGF.cpp @@ -41,7 +41,7 @@ static int getBits(int numBits, const u8 *buf, size_t pos) { _dbg_assert_msg_(numBits <= 32, "Unable to return more than 32 bits, %d 
requested", numBits); const size_t wordpos = pos >> 5; - const u32 *wordbuf = (const u32 *)buf; + const u32_le *wordbuf = (const u32_le *)buf; const u8 bitoff = pos & 31; // Might just be in one, has to be within two. diff --git a/Core/Font/PGF.h b/Core/Font/PGF.h index a4e3f420d880..3a19583c0e63 100644 --- a/Core/Font/PGF.h +++ b/Core/Font/PGF.h @@ -23,6 +23,7 @@ #include #include "Common/CommonTypes.h" +#include "Common/Swap.h" class PointerWrap; @@ -64,7 +65,7 @@ enum Language { FONT_LANGUAGE_CHINESE = 4, }; -enum FontPixelFormat { +enum FontPixelFormat : u32 { PSP_FONT_PIXELFORMAT_4 = 0, // 2 pixels packed in 1 byte (natural order) PSP_FONT_PIXELFORMAT_4_REV = 1, // 2 pixels packed in 1 byte (reversed order) PSP_FONT_PIXELFORMAT_8 = 2, // 1 pixel in 1 byte @@ -109,15 +110,8 @@ struct Glyph { u32 ptr; }; - -#if COMMON_LITTLE_ENDIAN -typedef FontPixelFormat FontPixelFormat_le; -#else -typedef swap_struct_t > FontPixelFormat_le; -#endif - struct GlyphImage { - FontPixelFormat_le pixelFormat; + LEndian pixelFormat; s32_le xPos64; s32_le yPos64; u16_le bufWidth; diff --git a/Core/HLE/FunctionWrappers.h b/Core/HLE/FunctionWrappers.h index 01124ea15735..291bfba64cdb 100644 --- a/Core/HLE/FunctionWrappers.h +++ b/Core/HLE/FunctionWrappers.h @@ -147,7 +147,7 @@ template void WrapI_IIIIIIU() { // Hm, do so many params get passed in registers? 
template void WrapI_IIIIIIIIU() { - u32 param8 = *(u32*)Memory::GetPointer(currentMIPS->r[29]); //Fixed 9th parameter, thanks to Kingcom + u32 param8 = *(u32_le*)Memory::GetPointer(currentMIPS->r[29]); //Fixed 9th parameter, thanks to Kingcom u32 retval = func(PARAM(0), PARAM(1), PARAM(2), PARAM(3), PARAM(4), PARAM(5), PARAM(6), PARAM(7), param8); RETURN(retval); } diff --git a/Core/HLE/HLE.cpp b/Core/HLE/HLE.cpp index 1634895a403c..dd02a482d93a 100644 --- a/Core/HLE/HLE.cpp +++ b/Core/HLE/HLE.cpp @@ -83,12 +83,12 @@ struct HLEMipsCallStack { u32_le func; u32_le actionIndex; u32_le argc; - }; + }vals; struct { u32_le ra; u32_le v0; u32_le v1; - }; + }regs; }; }; @@ -422,9 +422,9 @@ void hleFlushCalls() { sp -= sizeof(HLEMipsCallStack); stackData.ptr = sp; stackData->nextOff = 0xFFFFFFFF; - stackData->ra = currentMIPS->pc; - stackData->v0 = currentMIPS->r[MIPS_REG_V0]; - stackData->v1 = currentMIPS->r[MIPS_REG_V1]; + stackData->regs.ra = currentMIPS->pc; + stackData->regs.v0 = currentMIPS->r[MIPS_REG_V0]; + stackData->regs.v1 = currentMIPS->r[MIPS_REG_V1]; // Now we'll set up the first in the chain. 
currentMIPS->pc = enqueuedMipsCalls[0].func; @@ -443,14 +443,14 @@ void hleFlushCalls() { sp -= stackAligned; stackData.ptr = sp; stackData->nextOff = stackAligned; - stackData->func = info.func; + stackData->vals.func = info.func; if (info.action) { - stackData->actionIndex = (int)mipsCallActions.size(); + stackData->vals.actionIndex = (int)mipsCallActions.size(); mipsCallActions.push_back(info.action); } else { - stackData->actionIndex = 0xFFFFFFFF; + stackData->vals.actionIndex = 0xFFFFFFFF; } - stackData->argc = (int)info.args.size(); + stackData->vals.argc = (int)info.args.size(); for (int j = 0; j < (int)info.args.size(); ++j) { Memory::Write_U32(info.args[j], sp + sizeof(HLEMipsCallStack) + j * sizeof(u32)); } @@ -473,9 +473,9 @@ void HLEReturnFromMipsCall() { return; } - if (stackData->actionIndex != 0xFFFFFFFF && stackData->actionIndex < (u32)mipsCallActions.size()) { - PSPAction *&action = mipsCallActions[stackData->actionIndex]; - VERBOSE_LOG(HLE, "Executing action for HLE mips call at %08x, sp=%08x", stackData->func, sp); + if (stackData->vals.actionIndex != 0xFFFFFFFF && stackData->vals.actionIndex < (u32)mipsCallActions.size()) { + PSPAction *&action = mipsCallActions[stackData->vals.actionIndex]; + VERBOSE_LOG(HLE, "Executing action for HLE mips call at %08x, sp=%08x", stackData->vals.func, sp); // Search for the saved v0/v1 values, to preserve the PSPAction API... PSPPointer finalMarker = stackData; @@ -490,11 +490,11 @@ void HLEReturnFromMipsCall() { } MipsCall mc; - mc.savedV0 = finalMarker->v0; - mc.savedV1 = finalMarker->v1; + mc.savedV0 = finalMarker->regs.v0; + mc.savedV1 = finalMarker->regs.v1; action->run(mc); - finalMarker->v0 = mc.savedV0; - finalMarker->v1 = mc.savedV1; + finalMarker->regs.v0 = mc.savedV0; + finalMarker->regs.v1 = mc.savedV1; delete action; action = nullptr; @@ -507,9 +507,9 @@ void HLEReturnFromMipsCall() { if (stackData->nextOff == 0xFFFFFFFF) { // We're done. Grab the HLE result's v0/v1 and return from the syscall. 
- currentMIPS->pc = stackData->ra; - currentMIPS->r[MIPS_REG_V0] = stackData->v0; - currentMIPS->r[MIPS_REG_V1] = stackData->v1; + currentMIPS->pc = stackData->regs.ra; + currentMIPS->r[MIPS_REG_V0] = stackData->regs.v0; + currentMIPS->r[MIPS_REG_V1] = stackData->regs.v1; sp += sizeof(HLEMipsCallStack); @@ -527,9 +527,9 @@ void HLEReturnFromMipsCall() { // Alright, we have another to call. hleSkipDeadbeef(); - currentMIPS->pc = stackData->func; + currentMIPS->pc = stackData->vals.func; currentMIPS->r[MIPS_REG_RA] = HLEMipsCallReturnAddress(); - for (int i = 0; i < (int)stackData->argc; i++) { + for (int i = 0; i < (int)stackData->vals.argc; i++) { currentMIPS->r[MIPS_REG_A0 + i] = Memory::Read_U32(sp + sizeof(HLEMipsCallStack) + i * sizeof(u32)); } DEBUG_LOG(HLE, "Executing next HLE mips call at %08x, sp=%08x", currentMIPS->pc, sp); diff --git a/Core/HLE/HLEHelperThread.cpp b/Core/HLE/HLEHelperThread.cpp index a142820b9222..0769d965bc4f 100644 --- a/Core/HLE/HLEHelperThread.cpp +++ b/Core/HLE/HLEHelperThread.cpp @@ -29,7 +29,7 @@ HLEHelperThread::HLEHelperThread() : id_(-1), entry_(0) { } -HLEHelperThread::HLEHelperThread(const char *threadName, u32 instructions[], u32 instrCount, u32 prio, int stacksize) { +HLEHelperThread::HLEHelperThread(const char *threadName, u32_le instructions[], u32 instrCount, u32 prio, int stacksize) { u32 instrBytes = instrCount * sizeof(u32); u32 totalBytes = instrBytes + sizeof(u32) * 2; AllocEntry(totalBytes); diff --git a/Core/HLE/HLEHelperThread.h b/Core/HLE/HLEHelperThread.h index 39df4c74025d..c1c57347ad8c 100644 --- a/Core/HLE/HLEHelperThread.h +++ b/Core/HLE/HLEHelperThread.h @@ -25,7 +25,7 @@ class HLEHelperThread { public: // For savestates. 
HLEHelperThread(); - HLEHelperThread(const char *threadName, u32 instructions[], u32 instrCount, u32 prio, int stacksize); + HLEHelperThread(const char *threadName, u32_le instructions[], u32 instrCount, u32 prio, int stacksize); HLEHelperThread(const char *threadName, const char *module, const char *func, u32 prio, int stacksize); ~HLEHelperThread(); void DoState(PointerWrap &p); diff --git a/Core/HLE/ReplaceTables.cpp b/Core/HLE/ReplaceTables.cpp index 359b76e23ccf..918088701a21 100644 --- a/Core/HLE/ReplaceTables.cpp +++ b/Core/HLE/ReplaceTables.cpp @@ -406,6 +406,9 @@ static int Replace_fabsf() { } static int Replace_vmmul_q_transp() { +#ifdef __BIG_ENDIAN__ + Crash(); // TODO +#endif float *out = (float *)Memory::GetPointer(PARAM(0)); const float *a = (const float *)Memory::GetPointer(PARAM(1)); const float *b = (const float *)Memory::GetPointer(PARAM(2)); @@ -421,8 +424,8 @@ static int Replace_vmmul_q_transp() { // a1 = matrix // a2 = source address static int Replace_gta_dl_write_matrix() { - u32 *ptr = (u32 *)Memory::GetPointer(PARAM(0)); - u32 *src = (u32_le *)Memory::GetPointer(PARAM(2)); + u32_le *ptr = (u32_le *)Memory::GetPointer(PARAM(0)); + u32_le *src = (u32_le *)Memory::GetPointer(PARAM(2)); u32 matrix = PARAM(1) << 24; if (!ptr || !src) { @@ -430,7 +433,7 @@ static int Replace_gta_dl_write_matrix() { return 38; } - u32 *dest = (u32_le *)Memory::GetPointer(ptr[0]); + u32_le *dest = (u32_le *)Memory::GetPointer(ptr[0]); if (!dest) { RETURN(0); return 38; @@ -480,15 +483,15 @@ static int Replace_gta_dl_write_matrix() { // TODO: Inline into a few NEON or SSE instructions - especially if a1 is a known immediate! // Anyway, not sure if worth it. There's not that many matrices written per frame normally. 
static int Replace_dl_write_matrix() { - u32 *dlStruct = (u32 *)Memory::GetPointer(PARAM(0)); - u32 *src = (u32 *)Memory::GetPointer(PARAM(2)); + u32_le *dlStruct = (u32_le *)Memory::GetPointer(PARAM(0)); + u32_le *src = (u32_le *)Memory::GetPointer(PARAM(2)); if (!dlStruct || !src) { RETURN(0); return 60; } - u32 *dest = (u32 *)Memory::GetPointer(dlStruct[2]); + u32_le *dest = (u32_le *)Memory::GetPointer(dlStruct[2]); if (!dest) { RETURN(0); return 60; diff --git a/Core/HLE/__sceAudio.cpp b/Core/HLE/__sceAudio.cpp index 57ef29281d18..a352938c10a4 100644 --- a/Core/HLE/__sceAudio.cpp +++ b/Core/HLE/__sceAudio.cpp @@ -54,7 +54,7 @@ std::atomic_flag atomicLock_; // We copy samples as they are written into this simple ring buffer. // Might try something more efficient later. -FixedSizeQueue chanSampleQueues[PSP_AUDIO_CHANNEL_MAX + 1]; +FixedSizeQueue chanSampleQueues[PSP_AUDIO_CHANNEL_MAX + 1]; int eventAudioUpdate = -1; int eventHostAudioUpdate = -1; @@ -243,7 +243,7 @@ u32 __AudioEnqueue(AudioChannel &chan, int chanNum, bool blocking) { // Good news: the volume doesn't affect the values at all. // We can just do a direct memory copy. const u32 totalSamples = chan.sampleCount * (chan.format == PSP_AUDIO_FORMAT_STEREO ? 2 : 1); - s16 *buf1 = 0, *buf2 = 0; + s16_le *buf1 = 0, *buf2 = 0; size_t sz1, sz2; chanSampleQueues[chanNum].pushPointers(totalSamples, &buf1, &sz1, &buf2, &sz2); @@ -265,7 +265,7 @@ u32 __AudioEnqueue(AudioChannel &chan, int chanNum, bool blocking) { // Walking a pointer for speed. But let's make sure we wouldn't trip on an invalid ptr. if (Memory::IsValidAddress(chan.sampleAddress + (totalSamples - 1) * sizeof(s16_le))) { - s16 *buf1 = 0, *buf2 = 0; + s16_le *buf1 = 0, *buf2 = 0; size_t sz1, sz2; chanSampleQueues[chanNum].pushPointers(totalSamples, &buf1, &sz1, &buf2, &sz2); AdjustVolumeBlock(buf1, sampleData, sz1, leftVol, rightVol); @@ -337,7 +337,7 @@ void __AudioUpdate(bool resetRecording) { // to the CPU. 
Much better to throttle the frame rate on frame display and just throw away audio // if the buffer somehow gets full. bool firstChannel = true; - std::vector srcBuffer; + std::vector srcBuffer; for (u32 i = 0; i < PSP_AUDIO_CHANNEL_MAX + 1; i++) { if (!chans[i].reserved) @@ -355,7 +355,7 @@ void __AudioUpdate(bool resetRecording) { ERROR_LOG(SCEAUDIO, "Channel %i buffer underrun at %i of %i", i, (int)chanSampleQueues[i].size() / 2, (int)sz / 2); } - const s16 *buf1 = 0, *buf2 = 0; + const s16_le *buf1 = 0, *buf2 = 0; size_t sz1, sz2; chanSampleQueues[i].popPointers(sz, &buf1, &sz1, &buf2, &sz2); diff --git a/Core/HLE/proAdhoc.cpp b/Core/HLE/proAdhoc.cpp index a0a3e89cad6a..90aecbcfb72e 100644 --- a/Core/HLE/proAdhoc.cpp +++ b/Core/HLE/proAdhoc.cpp @@ -24,11 +24,13 @@ #if !defined(_WIN32) #include #include -#include #include #include +#if !defined(__wiiu__) +#include #include #endif +#endif #include @@ -1069,7 +1071,7 @@ void AfterMatchingMipsCall::run(MipsCall &call) { //call.setReturnValue(v0); } -void AfterMatchingMipsCall::SetData(int ContextID, int eventId, u32_le BufAddr) { +void AfterMatchingMipsCall::SetData(int ContextID, int eventId, u32 BufAddr) { contextID = ContextID; EventID = eventId; bufAddr = BufAddr; @@ -1147,8 +1149,8 @@ void notifyAdhocctlHandlers(u32 flag, u32 error) { void notifyMatchingHandler(SceNetAdhocMatchingContext * context, ThreadMessage * msg, void * opt, u32_le &bufAddr, u32_le &bufLen, u32_le * args) { // Don't share buffer address space with other mipscall in the queue since mipscalls aren't immediately executed MatchingArgs argsNew; - u32_le dataBufLen = msg->optlen + 8; //max(bufLen, msg->optlen + 8); - u32_le dataBufAddr = userMemory.Alloc(dataBufLen); // We will free this memory after returning from mipscall + u32 dataBufLen = msg->optlen + 8; //max(bufLen, msg->optlen + 8); + u32 dataBufAddr = userMemory.Alloc(dataBufLen); // We will free this memory after returning from mipscall uint8_t * dataPtr = 
Memory::GetPointer(dataBufAddr); memcpy(dataPtr, &msg->mac, sizeof(msg->mac)); if (msg->optlen > 0) diff --git a/Core/HLE/proAdhoc.h b/Core/HLE/proAdhoc.h index fa9d520e3ee8..c7ff22a20c30 100644 --- a/Core/HLE/proAdhoc.h +++ b/Core/HLE/proAdhoc.h @@ -32,17 +32,22 @@ #include #endif -#ifdef _MSC_VER -#define PACK // on MSVC we use #pragma pack() instead so let's kill this. -#else -#define PACK __attribute__((packed)) -#endif - #include #include #include "net/resolve.h" #include "Common/Serialize/Serializer.h" +#include "Common/Swap.h" + +#ifdef _MSC_VER +#define PACK // on MSVC we use #pragma pack() instead so let's kill this. +#elif defined(__BIG_ENDIAN__) +// packed cannot be used with non-POD *_le types. +// TODO: find a real solution for the couple of structs that actually need this +#define PACK +#else +#define PACK __attribute__((packed)) +#endif #include "Core/Config.h" #include "Core/CoreTiming.h" @@ -834,7 +839,7 @@ class AfterMatchingMipsCall : public PSPAction { static PSPAction *Create() { return new AfterMatchingMipsCall(); } void DoState(PointerWrap &p) override; void run(MipsCall &call) override; - void SetData(int ContextID, int eventId, u32_le BufAddr); + void SetData(int ContextID, int eventId, u32 BufAddr); private: int contextID = -1; diff --git a/Core/HLE/sceAtrac.cpp b/Core/HLE/sceAtrac.cpp index bdaeea8b98bf..1ccb07e7e230 100644 --- a/Core/HLE/sceAtrac.cpp +++ b/Core/HLE/sceAtrac.cpp @@ -1226,6 +1226,8 @@ u32 _AtracDecodeData(int atracID, u8 *outbuf, u32 outbufPtr, u32 *SamplesNum, u3 } if (avret < 0) { ERROR_LOG(ME, "swr_convert: Error while converting %d", avret); + } else { + ToLEndian((s16*)out, avret * atrac->outputChannels_); } } #endif // USE_FFMPEG @@ -1569,7 +1571,7 @@ static u32 sceAtracGetNextSample(int atracID, u32 outNAddr) { // Obtains the number of frames remaining in the buffer which can be decoded. // When no more data would be needed, this returns a negative number. 
static u32 sceAtracGetRemainFrame(int atracID, u32 remainAddr) { - auto remainingFrames = PSPPointer::Create(remainAddr); + auto remainingFrames = PSPPointer::Create(remainAddr); Atrac *atrac = getAtrac(atracID); u32 err = AtracValidateManaged(atrac); @@ -1588,8 +1590,8 @@ static u32 sceAtracGetRemainFrame(int atracID, u32 remainAddr) { } static u32 sceAtracGetSecondBufferInfo(int atracID, u32 fileOffsetAddr, u32 desiredSizeAddr) { - auto fileOffset = PSPPointer::Create(fileOffsetAddr); - auto desiredSize = PSPPointer::Create(desiredSizeAddr); + auto fileOffset = PSPPointer::Create(fileOffsetAddr); + auto desiredSize = PSPPointer::Create(desiredSizeAddr); Atrac *atrac = getAtrac(atracID); u32 err = AtracValidateManaged(atrac); @@ -1624,13 +1626,13 @@ static u32 sceAtracGetSoundSample(int atracID, u32 outEndSampleAddr, u32 outLoop return err; } - auto outEndSample = PSPPointer::Create(outEndSampleAddr); + auto outEndSample = PSPPointer::Create(outEndSampleAddr); if (outEndSample.IsValid()) *outEndSample = atrac->endSample_; - auto outLoopStart = PSPPointer::Create(outLoopStartSampleAddr); + auto outLoopStart = PSPPointer::Create(outLoopStartSampleAddr); if (outLoopStart.IsValid()) *outLoopStart = atrac->loopStartSample_ == -1 ? -1 : atrac->loopStartSample_ - atrac->firstSampleOffset_ - atrac->FirstOffsetExtra(); - auto outLoopEnd = PSPPointer::Create(outLoopEndSampleAddr); + auto outLoopEnd = PSPPointer::Create(outLoopEndSampleAddr); if (outLoopEnd.IsValid()) *outLoopEnd = atrac->loopEndSample_ == -1 ? 
-1 : atrac->loopEndSample_ - atrac->firstSampleOffset_ - atrac->FirstOffsetExtra(); @@ -2296,7 +2298,7 @@ void _AtracGenerateContext(Atrac *atrac, SceAtracId *context) { context->info.streamDataByte = atrac->first_.size - atrac->dataOff_; u8* buf = (u8*)context; - *(u32*)(buf + 0xfc) = atrac->atracID_; + *(u32_le*)(buf + 0xfc) = atrac->atracID_; } static u32 _sceAtracGetContextAddress(int atracID) { @@ -2418,9 +2420,9 @@ static int sceAtracLowLevelInitDecoder(int atracID, u32 paramsAddr) { static int sceAtracLowLevelDecode(int atracID, u32 sourceAddr, u32 sourceBytesConsumedAddr, u32 samplesAddr, u32 sampleBytesAddr) { auto srcp = PSPPointer::Create(sourceAddr); - auto srcConsumed = PSPPointer::Create(sourceBytesConsumedAddr); + auto srcConsumed = PSPPointer::Create(sourceBytesConsumedAddr); auto outp = PSPPointer::Create(samplesAddr); - auto outWritten = PSPPointer::Create(sampleBytesAddr); + auto outWritten = PSPPointer::Create(sampleBytesAddr); Atrac *atrac = getAtrac(atracID); if (!atrac) { diff --git a/Core/HLE/sceAtrac.h b/Core/HLE/sceAtrac.h index 1f48b48cef7e..7ee384ea0dc2 100644 --- a/Core/HLE/sceAtrac.h +++ b/Core/HLE/sceAtrac.h @@ -42,12 +42,6 @@ enum AtracStatus : u8 { ATRAC_STATUS_STREAMED_MASK = 4, }; -#if COMMON_LITTLE_ENDIAN -typedef AtracStatus AtracStatus_le; -#else -typedef swap_struct_t > AtracStatus_le; -#endif - typedef struct { u32_le decodePos; // 0 @@ -58,7 +52,7 @@ typedef struct char numFrame; // 20 // 2: all the stream data on the buffer // 6: looping -> second buffer needed - AtracStatus_le state; // 21 + AtracStatus state; // 21 char unk22; char numChan; // 23 u16_le sampleSize; // 24 @@ -90,4 +84,4 @@ typedef struct u32 _AtracAddStreamData(int atracID, u32 bufPtr, u32 bytesToAdd); u32 _AtracDecodeData(int atracID, u8* outbuf, u32 outbufPtr, u32 *SamplesNum, u32* finish, int *remains); -int _AtracGetIDByContext(u32 contextAddr); \ No newline at end of file +int _AtracGetIDByContext(u32 contextAddr); diff --git 
a/Core/HLE/sceAudiocodec.cpp b/Core/HLE/sceAudiocodec.cpp index 21d643e1fc6c..ea7de2ab13f1 100644 --- a/Core/HLE/sceAudiocodec.cpp +++ b/Core/HLE/sceAudiocodec.cpp @@ -116,7 +116,9 @@ static int sceAudiocodecDecode(u32 ctxPtr, int codec) { if (decoder != NULL) { // Decode audio - decoder->Decode(Memory::GetPointer(ctx->inDataPtr), ctx->inDataSize, Memory::GetPointer(ctx->outDataPtr), &outbytes); + u8* outbuf = Memory::GetPointer(ctx->outDataPtr); + decoder->Decode(Memory::GetPointer(ctx->inDataPtr), ctx->inDataSize, outbuf, &outbytes); + ToLEndian((s16*)outbuf, outbytes / 2); } DEBUG_LOG(ME, "sceAudiocodecDec(%08x, %i (%s))", ctxPtr, codec, GetCodecName(codec)); return 0; diff --git a/Core/HLE/sceChnnlsv.cpp b/Core/HLE/sceChnnlsv.cpp index 475e2ed8d2df..dc824a465623 100644 --- a/Core/HLE/sceChnnlsv.cpp +++ b/Core/HLE/sceChnnlsv.cpp @@ -93,11 +93,11 @@ static int typeFromMode(int mode) static int kirkSendCmd(u8* data, int length, int num, bool encrypt) { - *(int*)(data+0) = encrypt ? KIRK_MODE_ENCRYPT_CBC : KIRK_MODE_DECRYPT_CBC; - *(int*)(data+4) = 0; - *(int*)(data+8) = 0; - *(int*)(data+12) = num; - *(int*)(data+16) = length; + *(s32_le*)(data+0) = encrypt ? KIRK_MODE_ENCRYPT_CBC : KIRK_MODE_DECRYPT_CBC; + *(s32_le*)(data+4) = 0; + *(s32_le*)(data+8) = 0; + *(s32_le*)(data+12) = num; + *(s32_le*)(data+16) = length; if (kirk_sceUtilsBufferCopyWithRange(data, length + 20, data, length + 20, encrypt ? KIRK_CMD_ENCRYPT_IV_0 : KIRK_CMD_DECRYPT_IV_0)) return -257; @@ -107,11 +107,11 @@ static int kirkSendCmd(u8* data, int length, int num, bool encrypt) static int kirkSendFuseCmd(u8* data, int length, bool encrypt) { - *(int*)(data+0) = encrypt ? KIRK_MODE_ENCRYPT_CBC : KIRK_MODE_DECRYPT_CBC; - *(int*)(data+4) = 0; - *(int*)(data+8) = 0; - *(int*)(data+12) = 256; - *(int*)(data+16) = length; + *(s32_le*)(data+0) = encrypt ? 
KIRK_MODE_ENCRYPT_CBC : KIRK_MODE_DECRYPT_CBC; + *(s32_le*)(data+4) = 0; + *(s32_le*)(data+8) = 0; + *(s32_le*)(data+12) = 256; + *(s32_le*)(data+16) = length; // Note: CMD 5 and 8 are not available, will always return -1 if (kirk_sceUtilsBufferCopyWithRange(data, length + 20, data, length + 20, encrypt ? KIRK_CMD_ENCRYPT_IV_FUSE : KIRK_CMD_DECRYPT_IV_FUSE)) @@ -172,7 +172,7 @@ static int sub_0000(u8* data_out, u8* data, int alignedLen, u8* data2, int& data else { memcpy(sp0, sp16, 12); - *(u32*)(sp0+12) = data3-1; + *(u32_le*)(sp0+12) = data3-1; } if (alignedLen > 0) @@ -180,7 +180,7 @@ static int sub_0000(u8* data_out, u8* data, int alignedLen, u8* data2, int& data for(int i = 20; i < alignedLen + 20; i += 16) { memcpy(data_out+i, sp16, 12); - *(u32*)(data_out+12+i) = data3; + *(u32_le*)(data_out+12+i) = data3; data3++; } } diff --git a/Core/HLE/sceDeflt.cpp b/Core/HLE/sceDeflt.cpp index 5c1f57907148..d60104125c4f 100644 --- a/Core/HLE/sceDeflt.cpp +++ b/Core/HLE/sceDeflt.cpp @@ -31,7 +31,7 @@ static int sceDeflateDecompress(u32 OutBuffer, int OutBufferLength, u32 InBuffer uLong crc; z_stream stream; u8 *outBufferPtr; - u32 *crc32AddrPtr = 0; + u32_le *crc32AddrPtr = 0; if (!Memory::IsValidAddress(OutBuffer) || !Memory::IsValidAddress(InBuffer)) { ERROR_LOG(HLE, "sceZlibDecompress: Bad address %08x %08x", OutBuffer, InBuffer); @@ -42,7 +42,7 @@ static int sceDeflateDecompress(u32 OutBuffer, int OutBufferLength, u32 InBuffer ERROR_LOG(HLE, "sceZlibDecompress: Bad address %08x", Crc32Addr); return 0; } - crc32AddrPtr = (u32 *)Memory::GetPointer(Crc32Addr); + crc32AddrPtr = (u32_le *)Memory::GetPointer(Crc32Addr); } outBufferPtr = Memory::GetPointer(OutBuffer); stream.next_in = (Bytef*)Memory::GetPointer(InBuffer); @@ -77,7 +77,7 @@ static int sceGzipDecompress(u32 OutBuffer, int OutBufferLength, u32 InBuffer, u uLong crc; z_stream stream; u8 *outBufferPtr; - u32 *crc32AddrPtr = 0; + u32_le *crc32AddrPtr = 0; if (!Memory::IsValidAddress(OutBuffer) || 
!Memory::IsValidAddress(InBuffer)) { ERROR_LOG(HLE, "sceZlibDecompress: Bad address %08x %08x", OutBuffer, InBuffer); @@ -88,7 +88,7 @@ static int sceGzipDecompress(u32 OutBuffer, int OutBufferLength, u32 InBuffer, u ERROR_LOG(HLE, "sceZlibDecompress: Bad address %08x", Crc32Addr); return 0; } - crc32AddrPtr = (u32 *)Memory::GetPointer(Crc32Addr); + crc32AddrPtr = (u32_le *)Memory::GetPointer(Crc32Addr); } outBufferPtr = Memory::GetPointer(OutBuffer); stream.next_in = (Bytef*)Memory::GetPointer(InBuffer); @@ -122,7 +122,7 @@ static int sceZlibDecompress(u32 OutBuffer, int OutBufferLength, u32 InBuffer, u uLong crc; z_stream stream; u8 *outBufferPtr; - u32 *crc32AddrPtr = 0; + u32_le *crc32AddrPtr = 0; if (!Memory::IsValidAddress(OutBuffer) || !Memory::IsValidAddress(InBuffer)) { ERROR_LOG(HLE, "sceZlibDecompress: Bad address %08x %08x", OutBuffer, InBuffer); @@ -133,7 +133,7 @@ static int sceZlibDecompress(u32 OutBuffer, int OutBufferLength, u32 InBuffer, u ERROR_LOG(HLE, "sceZlibDecompress: Bad address %08x", Crc32Addr); return 0; } - crc32AddrPtr = (u32 *)Memory::GetPointer(Crc32Addr); + crc32AddrPtr = (u32_le *)Memory::GetPointer(Crc32Addr); } outBufferPtr = Memory::GetPointer(OutBuffer); stream.next_in = (Bytef*)Memory::GetPointer(InBuffer); diff --git a/Core/HLE/sceFont.cpp b/Core/HLE/sceFont.cpp index dbf0bebf3433..1f13e81dd336 100644 --- a/Core/HLE/sceFont.cpp +++ b/Core/HLE/sceFont.cpp @@ -585,7 +585,7 @@ class FontLib { } // For FONT_OPEN_USER* modes, the font will automatically be freed. - LoadedFont *OpenFont(Font *font, FontOpenMode mode, int &error) { + LoadedFont *OpenFont(Font *font, FontOpenMode mode, s32_le &error) { // TODO: Do something with mode, possibly save it where the PSP does in the struct. // Maybe needed in Font, though? Handlers seem... difficult to emulate. 
int freeFontIndex = -1; @@ -944,7 +944,7 @@ static u32 sceFontNewLib(u32 paramPtr, u32 errorCodePtr) { __LoadInternalFonts(); auto params = PSPPointer::Create(paramPtr); - auto errorCode = PSPPointer::Create(errorCodePtr); + auto errorCode = PSPPointer::Create(errorCodePtr); if (!params.IsValid() || !errorCode.IsValid()) { ERROR_LOG_REPORT(SCEFONT, "sceFontNewLib(%08x, %08x): invalid addresses", paramPtr, errorCodePtr); @@ -979,7 +979,7 @@ static int sceFontDoneLib(u32 fontLibHandle) { // Open internal font into a FontLib static u32 sceFontOpen(u32 libHandle, u32 index, u32 mode, u32 errorCodePtr) { - auto errorCode = PSPPointer::Create(errorCodePtr); + auto errorCode = PSPPointer::Create(errorCodePtr); if (!errorCode.IsValid()) { // Would crash on the PSP. ERROR_LOG(SCEFONT, "sceFontOpen(%x, %x, %x, %x): invalid pointer", libHandle, index, mode, errorCodePtr); @@ -1009,7 +1009,7 @@ static u32 sceFontOpen(u32 libHandle, u32 index, u32 mode, u32 errorCodePtr) { // Open a user font in RAM into a FontLib static u32 sceFontOpenUserMemory(u32 libHandle, u32 memoryFontAddrPtr, u32 memoryFontLength, u32 errorCodePtr) { - auto errorCode = PSPPointer::Create(errorCodePtr); + auto errorCode = PSPPointer::Create(errorCodePtr); if (!errorCode.IsValid()) { ERROR_LOG_REPORT(SCEFONT, "sceFontOpenUserMemory(%08x, %08x, %08x, %08x): invalid error address", libHandle, memoryFontAddrPtr, memoryFontLength, errorCodePtr); return -1; @@ -1054,7 +1054,7 @@ static u32 sceFontOpenUserMemory(u32 libHandle, u32 memoryFontAddrPtr, u32 memor // Open a user font in a file into a FontLib static u32 sceFontOpenUserFile(u32 libHandle, const char *fileName, u32 mode, u32 errorCodePtr) { - auto errorCode = PSPPointer::Create(errorCodePtr); + auto errorCode = PSPPointer::Create(errorCodePtr); if (!errorCode.IsValid()) { ERROR_LOG_REPORT(SCEFONT, "sceFontOpenUserFile(%08x, %s, %08x, %08x): invalid error address", libHandle, fileName, mode, errorCodePtr); @@ -1114,7 +1114,7 @@ static int 
sceFontClose(u32 fontHandle) { } static int sceFontFindOptimumFont(u32 libHandle, u32 fontStylePtr, u32 errorCodePtr) { - auto errorCode = PSPPointer::Create(errorCodePtr); + auto errorCode = PSPPointer::Create(errorCodePtr); if (!errorCode.IsValid()) { ERROR_LOG_REPORT(SCEFONT, "sceFontFindOptimumFont(%08x, %08x, %08x): invalid error address", libHandle, fontStylePtr, errorCodePtr); return SCE_KERNEL_ERROR_INVALID_ARGUMENT; @@ -1139,8 +1139,8 @@ static int sceFontFindOptimumFont(u32 libHandle, u32 fontStylePtr, u32 errorCode auto requestedStyle = PSPPointer::Create(fontStylePtr); // Find the first nearest match for H/V, OR the last exact match for others. - float hRes = requestedStyle->fontHRes > 0.0f ? requestedStyle->fontHRes : fontLib->FontHRes(); - float vRes = requestedStyle->fontVRes > 0.0f ? requestedStyle->fontVRes : fontLib->FontVRes(); + float hRes = requestedStyle->fontHRes > 0.0f ? (float)requestedStyle->fontHRes : fontLib->FontHRes(); + float vRes = requestedStyle->fontVRes > 0.0f ? (float)requestedStyle->fontVRes : fontLib->FontVRes(); Font *optimumFont = 0; Font *nearestFont = 0; float nearestDist = std::numeric_limits::infinity(); @@ -1182,7 +1182,7 @@ static int sceFontFindOptimumFont(u32 libHandle, u32 fontStylePtr, u32 errorCode // Returns the font index, not handle static int sceFontFindFont(u32 libHandle, u32 fontStylePtr, u32 errorCodePtr) { - auto errorCode = PSPPointer::Create(errorCodePtr); + auto errorCode = PSPPointer::Create(errorCodePtr); if (!errorCode.IsValid()) { ERROR_LOG_REPORT(SCEFONT, "sceFontFindFont(%x, %x, %x): invalid error address", libHandle, fontStylePtr, errorCodePtr); return SCE_KERNEL_ERROR_INVALID_ARGUMENT; @@ -1206,8 +1206,8 @@ static int sceFontFindFont(u32 libHandle, u32 fontStylePtr, u32 errorCodePtr) { auto requestedStyle = PSPPointer::Create(fontStylePtr); // Find the closest exact match for the fields specified. - float hRes = requestedStyle->fontHRes > 0.0f ? 
requestedStyle->fontHRes : fontLib->FontHRes(); - float vRes = requestedStyle->fontVRes > 0.0f ? requestedStyle->fontVRes : fontLib->FontVRes(); + float hRes = requestedStyle->fontHRes > 0.0f ? (float)requestedStyle->fontHRes : fontLib->FontHRes(); + float vRes = requestedStyle->fontVRes > 0.0f ? (float)requestedStyle->fontVRes : fontLib->FontVRes(); for (size_t i = 0; i < internalFonts.size(); i++) { if (internalFonts[i]->MatchesStyle(*requestedStyle) != MATCH_NONE) { auto matchStyle = internalFonts[i]->GetFontStyle(); @@ -1462,7 +1462,7 @@ static int sceFontGetFontList(u32 fontLibHandle, u32 fontStylePtr, int numFonts) } static int sceFontGetNumFontList(u32 fontLibHandle, u32 errorCodePtr) { - auto errorCode = PSPPointer::Create(errorCodePtr); + auto errorCode = PSPPointer::Create(errorCodePtr); if (!errorCode.IsValid()) { ERROR_LOG_REPORT(SCEFONT, "sceFontGetNumFontList(%08x, %08x): invalid error address", fontLibHandle, errorCodePtr); return ERROR_FONT_INVALID_PARAMETER; @@ -1494,7 +1494,7 @@ static int sceFontSetResolution(u32 fontLibHandle, float hRes, float vRes) { } static float sceFontPixelToPointH(int fontLibHandle, float fontPixelsH, u32 errorCodePtr) { - auto errorCode = PSPPointer::Create(errorCodePtr); + auto errorCode = PSPPointer::Create(errorCodePtr); if (!errorCode.IsValid()) { ERROR_LOG_REPORT(SCEFONT, "sceFontPixelToPointH(%08x, %f, %08x): invalid error address", fontLibHandle, fontPixelsH, errorCodePtr); return 0.0f; @@ -1511,7 +1511,7 @@ static float sceFontPixelToPointH(int fontLibHandle, float fontPixelsH, u32 erro } static float sceFontPixelToPointV(int fontLibHandle, float fontPixelsV, u32 errorCodePtr) { - auto errorCode = PSPPointer::Create(errorCodePtr); + auto errorCode = PSPPointer::Create(errorCodePtr); if (!errorCode.IsValid()) { ERROR_LOG_REPORT(SCEFONT, "sceFontPixelToPointV(%08x, %f, %08x): invalid error address", fontLibHandle, fontPixelsV, errorCodePtr); return 0.0f; @@ -1528,7 +1528,7 @@ static float sceFontPixelToPointV(int 
fontLibHandle, float fontPixelsV, u32 erro } static float sceFontPointToPixelH(int fontLibHandle, float fontPointsH, u32 errorCodePtr) { - auto errorCode = PSPPointer::Create(errorCodePtr); + auto errorCode = PSPPointer::Create(errorCodePtr); if (!errorCode.IsValid()) { ERROR_LOG_REPORT(SCEFONT, "sceFontPointToPixelH(%08x, %f, %08x): invalid error address", fontLibHandle, fontPointsH, errorCodePtr); return 0.0f; @@ -1545,7 +1545,7 @@ static float sceFontPointToPixelH(int fontLibHandle, float fontPointsH, u32 erro } static float sceFontPointToPixelV(int fontLibHandle, float fontPointsV, u32 errorCodePtr) { - auto errorCode = PSPPointer::Create(errorCodePtr); + auto errorCode = PSPPointer::Create(errorCodePtr); if (!errorCode.IsValid()) { ERROR_LOG_REPORT(SCEFONT, "sceFontPointToPixelV(%08x, %f, %08x): invalid error address", fontLibHandle, fontPointsV, errorCodePtr); return 0.0f; diff --git a/Core/HLE/sceGe.cpp b/Core/HLE/sceGe.cpp index 0cdae03b8a32..7fa67205e541 100644 --- a/Core/HLE/sceGe.cpp +++ b/Core/HLE/sceGe.cpp @@ -376,7 +376,7 @@ static int sceGeListUpdateStallAddr(u32 displayListID, u32 stallAddress) { hleEatCycles(190); CoreTiming::ForceCheck(); - DEBUG_LOG(SCEGE, "sceGeListUpdateStallAddr(dlid=%i, stalladdr=%08x)", displayListID, stallAddress); + DEBUG_LOG(SCEGE, "sceGeListUpdateStallAddr(dlid=%08x, stalladdr=%08x)", displayListID, stallAddress); return gpu->UpdateStall(LIST_ID_MAGIC ^ displayListID, stallAddress); } diff --git a/Core/HLE/sceIo.cpp b/Core/HLE/sceIo.cpp index 9316bae16f33..e1f56d45b921 100644 --- a/Core/HLE/sceIo.cpp +++ b/Core/HLE/sceIo.cpp @@ -2465,9 +2465,9 @@ static int __IoIoctl(u32 id, u32 cmd, u32 indataPtr, u32 inlen, u32 outdataPtr, // Even if the size is 4, it still actually reads a 16 byte struct, it seems. 
if (Memory::IsValidAddress(indataPtr) && inlen >= 4) { struct SeekInfo { - u64 offset; - u32 unk; - u32 whence; + u64_le offset; + u32_le unk; + u32_le whence; }; const auto seekInfo = PSPPointer::Create(indataPtr); FileMove seek; @@ -2569,9 +2569,9 @@ static int __IoIoctl(u32 id, u32 cmd, u32 indataPtr, u32 inlen, u32 outdataPtr, if (Memory::IsValidAddress(indataPtr) && inlen >= 4) { struct SeekInfo { - u64 offset; - u32 unk; - u32 whence; + u64_le offset; + u32_le unk; + u32_le whence; }; const auto seekInfo = PSPPointer::Create(indataPtr); FileMove seek; diff --git a/Core/HLE/sceJpeg.cpp b/Core/HLE/sceJpeg.cpp index aa6d554ded1a..95c3effe1d58 100644 --- a/Core/HLE/sceJpeg.cpp +++ b/Core/HLE/sceJpeg.cpp @@ -76,7 +76,7 @@ static void __JpegCsc(u32 imageAddr, u32 yCbCrAddr, int widthHeight, int bufferW int width = (widthHeight >> 16) & 0xFFF; int lineWidth = std::min(width, bufferWidth); int skipEndOfLine = std::max(0, bufferWidth - lineWidth); - u32 *imageBuffer = (u32*)Memory::GetPointer(imageAddr); + u32_le *imageBuffer = (u32_le*)Memory::GetPointer(imageAddr); int sizeY = width * height; int sizeCb = sizeY >> 2; u8 *Y = (u8*)Memory::GetPointer(yCbCrAddr); @@ -142,7 +142,7 @@ static int __DecodeJpeg(u32 jpegAddr, int jpegSize, u32 imageAddr) { if (actual_components == 3) { u24_be *imageBuffer = (u24_be*)jpegBuf; - u32 *abgr = (u32*)Memory::GetPointer(imageAddr); + u32_le *abgr = (u32_le*)Memory::GetPointer(imageAddr); int pspWidth = 0; for (int w = 2; w <= 4096; w *= 2) { if (w >= width && w >= height) { diff --git a/Core/HLE/sceKernel.h b/Core/HLE/sceKernel.h index c9d4e20d4c11..3c59adbb0c7e 100644 --- a/Core/HLE/sceKernel.h +++ b/Core/HLE/sceKernel.h @@ -499,7 +499,7 @@ class KernelObjectPool { } } - int ListIDType(int type, SceUID *uids, int count) const { + int ListIDType(int type, SceUID_le *uids, int count) const { int total = 0; for (int i = 0; i < maxCount; i++) { if (!occupied[i]) { diff --git a/Core/HLE/sceKernelEventFlag.cpp 
b/Core/HLE/sceKernelEventFlag.cpp index b4c28732b6ed..ae12691f51c9 100644 --- a/Core/HLE/sceKernelEventFlag.cpp +++ b/Core/HLE/sceKernelEventFlag.cpp @@ -59,9 +59,9 @@ class EventFlag : public KernelObject { static const char *GetStaticTypeName() { return "EventFlag"; } void GetQuickInfo(char *ptr, int size) override { sprintf(ptr, "init=%08x cur=%08x numwait=%i", - nef.initPattern, - nef.currentPattern, - nef.numWaitThreads); + (u32)nef.initPattern, + (u32)nef.currentPattern, + (s32)nef.numWaitThreads); } static u32 GetMissingErrorCode() { diff --git a/Core/HLE/sceKernelModule.cpp b/Core/HLE/sceKernelModule.cpp index 0b2b55ae74d1..475f6f706ae5 100644 --- a/Core/HLE/sceKernelModule.cpp +++ b/Core/HLE/sceKernelModule.cpp @@ -254,8 +254,8 @@ class PSPModule : public KernelObject { sprintf(ptr, "%sname=%s gp=%08x entry=%08x", isFake ? "faked " : "", nm.name, - nm.gp_value, - nm.entry_addr); + (u32)nm.gp_value, + (u32)nm.entry_addr); } static u32 GetMissingErrorCode() { return SCE_KERNEL_ERROR_UNKNOWN_MODULE; } static int GetStaticIDType() { return PPSSPP_KERNEL_TMID_Module; } @@ -1060,7 +1060,7 @@ static bool KernelImportModuleFuncs(PSPModule *module, u32 *firstImportStubAddr, } snprintf(temp, sizeof(temp), "%s ver=%04x, flags=%04x, size=%d, numVars=%d, numFuncs=%d, nidData=%08x, firstSym=%08x, varData=%08x, extra=%08x\n", - modulename, entry->version, entry->flags, entry->size, entry->numVars, entry->numFuncs, entry->nidData, entry->firstSymAddr, entry->size >= 6 ? entry->varData : 0, entry->size >= 7 ? entry->extra : 0); + modulename, (u16)entry->version, (u16)entry->flags, entry->size, entry->numVars, (u16)entry->numFuncs, (u32)entry->nidData, (u32)entry->firstSymAddr, entry->size >= 6 ? (u32)entry->varData : 0, entry->size >= 7 ? 
(u32)entry->extra : 0); debugInfo += temp; } @@ -1401,11 +1401,11 @@ static PSPModule *__KernelLoadELFFromPtr(const u8 *ptr, size_t elfSize, u32 load name = "invalid?"; } - INFO_LOG(LOADER, "Exporting ent %d named %s, %d funcs, %d vars, resident %08x", m, name, ent->fcount, ent->vcount, ent->resident); + INFO_LOG(LOADER, "Exporting ent %d named %s, %d funcs, %d vars, resident %08x", m, name, (u32)ent->fcount, (u32)ent->vcount, (u32)ent->resident); if (!Memory::IsValidAddress(ent->resident)) { if (ent->fcount + variableCount > 0) { - WARN_LOG_REPORT(LOADER, "Invalid export resident address %08x", ent->resident); + WARN_LOG_REPORT(LOADER, "Invalid export resident address %08x", (u32)ent->resident); } continue; } @@ -1414,7 +1414,7 @@ static PSPModule *__KernelLoadELFFromPtr(const u8 *ptr, size_t elfSize, u32 load u32_le *exportPtr = residentPtr + ent->fcount + variableCount; if (ent->size != 4 && ent->unknown1 != 0 && ent->unknown2 != 0) { - WARN_LOG_REPORT(LOADER, "Unexpected export module entry size %d, vcountNew=%08x, unknown1=%08x, unknown2=%08x", ent->size, ent->vcountNew, ent->unknown1, ent->unknown2); + WARN_LOG_REPORT(LOADER, "Unexpected export module entry size %d, vcountNew=%08x, unknown1=%08x, unknown2=%08x", ent->size, (u32)ent->vcountNew, ent->unknown1, ent->unknown2); } FuncSymbolExport func; diff --git a/Core/HLE/sceKernelMutex.cpp b/Core/HLE/sceKernelMutex.cpp index 1b6b31476432..7a35835bf954 100644 --- a/Core/HLE/sceKernelMutex.cpp +++ b/Core/HLE/sceKernelMutex.cpp @@ -1089,7 +1089,7 @@ static int __KernelReferLwMutexStatus(SceUID uid, u32 infoPtr) // Refresh and write m->nm.currentCount = workarea->lockLevel; - m->nm.lockThread = workarea->lockThread == 0 ? -1 : workarea->lockThread; + m->nm.lockThread = workarea->lockThread == 0 ? 
-1 : (SceUID)workarea->lockThread; m->nm.numWaitThreads = (int) m->waitingThreads.size(); Memory::WriteStruct(infoPtr, &m->nm); } diff --git a/Core/HLE/sceKernelThread.cpp b/Core/HLE/sceKernelThread.cpp index 0e096111c03f..12ec8452c1a7 100644 --- a/Core/HLE/sceKernelThread.cpp +++ b/Core/HLE/sceKernelThread.cpp @@ -147,8 +147,8 @@ class PSPCallback : public KernelObject { void GetQuickInfo(char *ptr, int size) override { sprintf(ptr, "thread=%i, argument= %08x", //hackAddress, - nc.threadId, - nc.commonArgument); + (s32)nc.threadId, + (u32)nc.commonArgument); } ~PSPCallback() { @@ -177,19 +177,13 @@ class PSPCallback : public KernelObject { NativeCallback nc; }; -#if COMMON_LITTLE_ENDIAN -typedef WaitType WaitType_le; -#else -typedef swap_struct_t > WaitType_le; -#endif - // Real PSP struct, don't change the fields. struct SceKernelThreadRunStatus { SceSize_le size; u32_le status; s32_le currentPriority; - WaitType_le waitType; + LEndian waitType; SceUID_le waitID; s32_le wakeupCount; SceKernelSysClock runForClocks; @@ -214,7 +208,7 @@ struct NativeThread s32_le initialPriority; s32_le currentPriority; - WaitType_le waitType; + LEndian waitType; SceUID_le waitID; s32_le wakeupCount; s32_le exitStatus; @@ -392,7 +386,7 @@ class PSPThread : public KernelObject { (nt.status & THREADSTATUS_DORMANT) ? "DORMANT" : "", (nt.status & THREADSTATUS_DEAD) ? "DEAD" : "", (int)nt.waitType, - nt.waitID, + (int)nt.waitID, waitInfo.waitValue); } @@ -1193,7 +1187,7 @@ void __KernelThreadingShutdown() { std::string __KernelThreadingSummary() { PSPThread *t = __GetCurrentThread(); - return StringFromFormat("Cur thread: %s (attr %08x)", t ? t->GetName() : "(null)", t ? t->nt.attr : 0); + return StringFromFormat("Cur thread: %s (attr %08x)", t ? t->GetName() : "(null)", t ? 
(u32)t->nt.attr : 0); } const char *__KernelGetThreadName(SceUID threadID) @@ -1401,7 +1395,7 @@ u32 sceKernelGetThreadmanIdList(u32 type, u32 readBufPtr, u32 readBufSize, u32 i } u32 total = 0; - auto uids = PSPPointer::Create(readBufPtr); + auto uids = PSPPointer::Create(readBufPtr); u32 error; if (type > 0 && type <= SCE_KERNEL_TMID_Tlspl) { DEBUG_LOG(SCEKERNEL, "sceKernelGetThreadmanIdList(%i, %08x, %i, %08x)", type, readBufPtr, readBufSize, idCountPtr); @@ -2022,7 +2016,7 @@ int __KernelStartThread(SceUID threadToStartID, int argSize, u32 argBlockPtr, bo return error; PSPThread *cur = __GetCurrentThread(); - __KernelResetThread(startThread, cur ? cur->nt.currentPriority : 0); + __KernelResetThread(startThread, cur ? (s32)cur->nt.currentPriority : 0); u32 &sp = startThread->context.r[MIPS_REG_SP]; // Force args means just use those as a0/a1 without any special treatment. @@ -3667,12 +3661,12 @@ int LoadExecForUser_362A956B() static const SceUID SCE_TE_THREADID_ALL_USER = 0xFFFFFFF0; struct NativeThreadEventHandler { - u32 size; + u32_le size; char name[KERNELOBJECT_MAX_NAME_LENGTH + 1]; - SceUID threadID; - u32 mask; - u32 handlerPtr; - u32 commonArg; + SceUID_le threadID; + u32_le mask; + u32_le handlerPtr; + u32_le commonArg; }; struct ThreadEventHandler : public KernelObject { diff --git a/Core/HLE/sceMp3.cpp b/Core/HLE/sceMp3.cpp index b3bdcb9b1860..7e47b8af72f2 100644 --- a/Core/HLE/sceMp3.cpp +++ b/Core/HLE/sceMp3.cpp @@ -367,7 +367,7 @@ static int FindMp3Header(AuCtx *ctx, int &header, int end) { for (int offset = 0; offset < end; ++offset) { // If we hit valid sync bits, then we've found a header. 
if (ptr[offset] == 0xFF && (ptr[offset + 1] & 0xC0) == 0xC0) { - header = bswap32(Memory::Read_U32(addr + offset)); + header = swap32(Memory::Read_U32(addr + offset)); return offset; } } @@ -685,6 +685,7 @@ static u32 sceMp3LowLevelDecode(u32 mp3, u32 sourceAddr, u32 sourceBytesConsumed int outpcmbytes = 0; ctx->decoder->Decode((void*)inbuff, 4096, outbuff, &outpcmbytes); + ToLEndian((s16*)outbuff, outpcmbytes / 2); Memory::Write_U32(ctx->decoder->GetSourcePos(), sourceBytesConsumedAddr); Memory::Write_U32(outpcmbytes, sampleBytesAddr); diff --git a/Core/HLE/sceMpeg.cpp b/Core/HLE/sceMpeg.cpp index 6a10183fab96..91a2b884f465 100644 --- a/Core/HLE/sceMpeg.cpp +++ b/Core/HLE/sceMpeg.cpp @@ -111,10 +111,10 @@ static AVPixelFormat pmp_want_pix_fmt; struct SceMpegLLI { - u32 pSrc; - u32 pDst; - u32 Next; - int iSize; + u32_le pSrc; + u32_le pDst; + u32_le Next; + s32_le iSize; }; void SceMpegAu::read(u32 addr) { @@ -306,8 +306,8 @@ static void AnalyzeMpeg(u8 *buffer, u32 validSize, MpegContext *ctx) { ctx->mpegMagic = *(u32_le*)buffer; ctx->mpegRawVersion = *(u32_le*)(buffer + PSMF_STREAM_VERSION_OFFSET); ctx->mpegVersion = getMpegVersion(ctx->mpegRawVersion); - ctx->mpegOffset = bswap32(*(u32_le*)(buffer + PSMF_STREAM_OFFSET_OFFSET)); - ctx->mpegStreamSize = bswap32(*(u32_le*)(buffer + PSMF_STREAM_SIZE_OFFSET)); + ctx->mpegOffset = swap32(*(u32_le*)(buffer + PSMF_STREAM_OFFSET_OFFSET)); + ctx->mpegStreamSize = swap32(*(u32_le*)(buffer + PSMF_STREAM_SIZE_OFFSET)); ctx->mpegFirstTimestamp = getMpegTimeStamp(buffer + PSMF_FIRST_TIMESTAMP_OFFSET); ctx->mpegLastTimestamp = getMpegTimeStamp(buffer + PSMF_LAST_TIMESTAMP_OFFSET); ctx->mpegFirstDate = convertTimestampToDate(ctx->mpegFirstTimestamp); diff --git a/Core/HLE/sceNet.h b/Core/HLE/sceNet.h index 8374a878ff06..87860e3eaa98 100644 --- a/Core/HLE/sceNet.h +++ b/Core/HLE/sceNet.h @@ -171,8 +171,8 @@ typedef struct SceNetApctlInfoInternal { // Using struct instead of union for in char name[APCTL_PROFILENAME_MAXLEN]; u8 
bssid[ETHER_ADDR_LEN]; char ssid[APCTL_SSID_MAXLEN]; - unsigned int ssidLength; // ssid string length (excluding null terminator) - unsigned int securityType; // a value of PSP_NET_APCTL_INFO_SECURITY_TYPE_NONE..PSP_NET_APCTL_INFO_SECURITY_TYPE_WPA? + u32_le ssidLength; // ssid string length (excluding null terminator) + u32_le securityType; // a value of PSP_NET_APCTL_INFO_SECURITY_TYPE_NONE..PSP_NET_APCTL_INFO_SECURITY_TYPE_WPA? u8 strength; // Signal strength in % u8 channel; u8 powerSave; // 1 on, 0 off @@ -181,12 +181,12 @@ typedef struct SceNetApctlInfoInternal { // Using struct instead of union for in char gateway[APCTL_IPADDR_MAXLEN]; char primaryDns[APCTL_IPADDR_MAXLEN]; char secondaryDns[APCTL_IPADDR_MAXLEN]; - unsigned int useProxy; // 1 for proxy, 0 for no proxy + u32_le useProxy; // 1 for proxy, 0 for no proxy char proxyUrl[APCTL_URL_MAXLEN]; - unsigned short proxyPort; - unsigned int eapType; // 0 is none, 1 is EAP-MD5 - unsigned int startBrowser; // 1 = start browser - unsigned int wifisp; // 1 if connection is for Wifi service providers (WISP) for sharing internet connection + u16_le proxyPort; + u32_le eapType; // 0 is none, 1 is EAP-MD5 + u32_le startBrowser; // 1 = start browser + u32_le wifisp; // 1 if connection is for Wifi service providers (WISP) for sharing internet connection } SceNetApctlInfoInternal; struct ApctlHandler { @@ -195,7 +195,7 @@ struct ApctlHandler { }; struct ApctlArgs { - u32_le data[5]; // OldState, NewState, Event, Error, ArgsAddr + u32 data[5]; // OldState, NewState, Event, Error, ArgsAddr }; class PointerWrap; diff --git a/Core/HLE/sceNetAdhoc.cpp b/Core/HLE/sceNetAdhoc.cpp index 242b0047b0b2..b321a2e165ae 100644 --- a/Core/HLE/sceNetAdhoc.cpp +++ b/Core/HLE/sceNetAdhoc.cpp @@ -1229,8 +1229,8 @@ static int sceNetAdhocPdpRecv(int id, void *addr, void * port, void *buf, void * } SceNetEtherAddr *saddr = (SceNetEtherAddr *)addr; - uint16_t * sport = (uint16_t *)port; //Looking at Quake3 sourcecode (net_adhoc.c) this is an 
"int" (32bit) but changing here to 32bit will cause FF-Type0 to see duplicated Host (thinking it was from a different host) - int * len = (int *)dataLength; + u16_le * sport = (u16_le *)port; //Looking at Quake3 sourcecode (net_adhoc.c) this is an "int" (32bit) but changing here to 32bit will cause FF-Type0 to see duplicated Host (thinking it was from a different host) + s32_le * len = (s32_le *)dataLength; if (netAdhocInited) { // Valid Socket ID if (id >= PdpIdStart && id < PdpIdEnd && pdp[id - PdpIdStart] != NULL) { @@ -2716,14 +2716,14 @@ static int sceNetAdhocPtpAccept(int id, u32 peerMacAddrPtr, u32 peerPortPtr, int if (Memory::IsValidAddress(peerMacAddrPtr)) { addr = PSPPointer::Create(peerMacAddrPtr); } - uint16_t * port = NULL; // + u16_le * port = NULL; // if (Memory::IsValidAddress(peerPortPtr)) { - port = (uint16_t *)Memory::GetPointer(peerPortPtr); + port = (u16_le *)Memory::GetPointer(peerPortPtr); } if (flag == 0) { // Prevent spamming Debug Log with retries of non-bocking socket - DEBUG_LOG(SCENET, "sceNetAdhocPtpAccept(%d, [%08x]=%s, [%08x]=%u, %d, %u) at %08x", id, peerMacAddrPtr, mac2str(addr).c_str(), peerPortPtr, port ? *port : -1, timeout, flag, currentMIPS->pc); + DEBUG_LOG(SCENET, "sceNetAdhocPtpAccept(%d, [%08x]=%s, [%08x]=%u, %d, %u) at %08x", id, peerMacAddrPtr, mac2str(addr).c_str(), peerPortPtr, port ? (int)*port : -1, timeout, flag, currentMIPS->pc); } else { - VERBOSE_LOG(SCENET, "sceNetAdhocPtpAccept(%d, [%08x]=%s, [%08x]=%u, %d, %u) at %08x", id, peerMacAddrPtr, mac2str(addr).c_str(), peerPortPtr, port ? *port : -1, timeout, flag, currentMIPS->pc); + VERBOSE_LOG(SCENET, "sceNetAdhocPtpAccept(%d, [%08x]=%s, [%08x]=%u, %d, %u) at %08x", id, peerMacAddrPtr, mac2str(addr).c_str(), peerPortPtr, port ? 
(int)*port : -1, timeout, flag, currentMIPS->pc); } if (!g_Config.bEnableWlan) { return 0; @@ -3122,7 +3122,7 @@ static int sceNetAdhocPtpSend(int id, u32 dataAddr, u32 dataSizeAddr, int timeou if (!g_Config.bEnableWlan) { return 0; } - int * len = (int *)Memory::GetPointer(dataSizeAddr); + s32_le * len = (s32_le *)Memory::GetPointer(dataSizeAddr); const char * data = Memory::GetCharPointer(dataAddr); // Library is initialized if (netAdhocInited) { @@ -3211,7 +3211,7 @@ static int sceNetAdhocPtpRecv(int id, u32 dataAddr, u32 dataSizeAddr, int timeou return 0; } void * buf = (void *)Memory::GetPointer(dataAddr); - int * len = (int *)Memory::GetPointer(dataSizeAddr); + s32_le * len = (s32_le *)Memory::GetPointer(dataSizeAddr); // Library is initialized if (netAdhocInited) { // Valid Socket @@ -4583,7 +4583,7 @@ void __NetTriggerCallbacks() { u32 flags = params->first; u32 error = params->second; - u32_le args[3] = { 0, 0, 0 }; + u32 args[3] = { 0, 0, 0 }; args[0] = flags; args[1] = error; @@ -4645,7 +4645,9 @@ void __NetMatchingCallbacks() //(int matchingId) auto params = matchingEvents.begin(); if (params != matchingEvents.end()) { - u32_le* args = (u32_le*)&(*params); + u32 args[6]; + for (int i = 0; i < 6; i++) + args[i] = params->data[i]; //auto context = findMatchingContext(args[0]); if (actionAfterMatchingMipsCall < 0) { @@ -4739,7 +4741,7 @@ static int sceNetAdhocctlGetPeerList(u32 sizeAddr, u32 bufAddr) { SceNetAdhocctlPeerInfoEmu *buf = NULL; if (Memory::IsValidAddress(bufAddr)) buf = (SceNetAdhocctlPeerInfoEmu *)Memory::GetPointer(bufAddr); - DEBUG_LOG(SCENET, "sceNetAdhocctlGetPeerList([%08x]=%i, %08x) at %08x", sizeAddr, /*buflen ? *buflen : -1*/Memory::Read_U32(sizeAddr), bufAddr, currentMIPS->pc); + DEBUG_LOG(SCENET, "sceNetAdhocctlGetPeerList([%08x]=%i, %08x) at %08x", sizeAddr, /*buflen ? 
(s32)*buflen : -1*/Memory::Read_U32(sizeAddr), bufAddr, currentMIPS->pc); if (!g_Config.bEnableWlan) { return -1; } @@ -4831,7 +4833,7 @@ static int sceNetAdhocctlGetAddrByName(const char *nickName, u32 sizeAddr, u32 b memcpy(nckName, nickName, ADHOCCTL_NICKNAME_LEN); // Copied to null-terminated var to prevent unexpected behaviour on Logs nckName[ADHOCCTL_NICKNAME_LEN - 1] = 0; - WARN_LOG(SCENET, "UNTESTED sceNetAdhocctlGetAddrByName(%s, [%08x]=%d/%zu, %08x)", nckName, sizeAddr, buflen ? *buflen : -1, sizeof(SceNetAdhocctlPeerInfoEmu), bufAddr); + WARN_LOG(SCENET, "UNTESTED sceNetAdhocctlGetAddrByName(%s, [%08x]=%d/%zu, %08x)", nckName, sizeAddr, buflen ? (s32)*buflen : -1, sizeof(SceNetAdhocctlPeerInfoEmu), bufAddr); // Library initialized if (netAdhocctlInited) @@ -5980,8 +5982,8 @@ int matchingEventThread(int matchingId) // Run while needed... if (context != NULL) { - u32 bufLen = context->rxbuflen; //0; - u32 bufAddr = 0; //= userMemory.Alloc(bufLen); //context->rxbuf; + u32_le bufLen = (u32)context->rxbuflen; //0; + u32_le bufAddr = 0; //= userMemory.Alloc(bufLen); //context->rxbuf; u32_le * args = context->handlerArgs; //MatchingArgs while (contexts != NULL && context->eventRunning) @@ -6229,7 +6231,7 @@ int matchingInputThread(int matchingId) // TODO: The MatchingInput thread is usi SceNetAdhocctlPeerInfo* peer = findFriend(&sendermac); if (peer != NULL) { now = CoreTiming::GetGlobalTimeUsScaled(); - u64_le delta = now - peer->last_recv; + u64 delta = now - peer->last_recv; DEBUG_LOG(SCENET, "Timestamp Delta: %llu (%llu - %llu) from %s", delta, now, peer->last_recv, mac2str(&sendermac).c_str()); if (/*context->rxbuf[0] > 0 &&*/ peer->last_recv != 0) peer->last_recv = now - 1; // - context->keepalive_int; // May need to deduce by ping interval to prevent Dissidia 012 unable to see other players (ie. 
disappearing issue) } diff --git a/Core/HLE/sceNp.cpp b/Core/HLE/sceNp.cpp index cd0fc9a26b02..00530d3a4aed 100644 --- a/Core/HLE/sceNp.cpp +++ b/Core/HLE/sceNp.cpp @@ -244,7 +244,7 @@ static int sceNpAuthCreateStartRequest(u32 paramAddr) // 2nd Arg seems to be used if not a negative number and exits the handler if it's negative (error code?) // 3rd Arg seems to be a data (ie. 92 bytes of data?) pointer and tested for null within callback handler (optional callback args?) u32 ticketLength = 248; // default ticket length? should be updated using the ticket length returned from login - notifyNpAuthHandlers(retval, ticketLength, (params.size >= 36) ? params.cbArgAddr : 0); + notifyNpAuthHandlers(retval, ticketLength, (params.size >= 36) ? (u32)params.cbArgAddr : 0); } //hleDelayResult(0, "give time", 500000); diff --git a/Core/HLE/sceNp.h b/Core/HLE/sceNp.h index 3c117b33c71d..0ac2d76029dd 100644 --- a/Core/HLE/sceNp.h +++ b/Core/HLE/sceNp.h @@ -173,7 +173,7 @@ struct NpAuthHandler { }; struct NpAuthArgs { - u32_le data[3]; // id, result, ArgAddr + u32 data[3]; // id, result, ArgAddr }; using SceNpAuthCallback = s32(s32 id, s32 result, PSPPointer arg); @@ -187,7 +187,7 @@ struct SceNpAuthRequestParameter u32_le cookieSize; u32_le entitlementIdAddr; //PSPPointer entitlementId; // null-terminated string u32_le consumedCount; // related to entitlement? 
- u32 ticketCbAddr; //PSPPointer ticketCb + u32_le ticketCbAddr; //PSPPointer ticketCb u32_le cbArgAddr; //PSPPointer cbArg }; diff --git a/Core/HLE/sceP3da.cpp b/Core/HLE/sceP3da.cpp index 1595dc139e2f..98b5d46e556f 100644 --- a/Core/HLE/sceP3da.cpp +++ b/Core/HLE/sceP3da.cpp @@ -48,11 +48,11 @@ static u32 sceP3daBridgeCore(u32 p3daCoreAddr, u32 channelsNum, u32 samplesNum, DEBUG_LOG(SCEAUDIO, "sceP3daBridgeCore(%08x, %08x, %08x, %08x, %08x)", p3daCoreAddr, channelsNum, samplesNum, inputAddr, outputAddr); if (Memory::IsValidAddress(inputAddr) && Memory::IsValidAddress(outputAddr)) { int scaleval = getScaleValue(channelsNum); - s16* outbuf = (s16*)Memory::GetPointer(outputAddr); + s16_le* outbuf = (s16_le*)Memory::GetPointer(outputAddr); memset(outbuf, 0, samplesNum * sizeof(s16) * 2); for (u32 k = 0; k < channelsNum; k++) { u32 inaddr = Memory::Read_U32(inputAddr + k * 4); - s16 *inbuf = (s16*)Memory::GetPointer(inaddr); + s16_le *inbuf = (s16_le*)Memory::GetPointer(inaddr); if (!inbuf) continue; for (u32 i = 0; i < samplesNum; i++) { diff --git a/Core/HLE/scePsmf.cpp b/Core/HLE/scePsmf.cpp index 0a601db8d434..770ab7507ad7 100644 --- a/Core/HLE/scePsmf.cpp +++ b/Core/HLE/scePsmf.cpp @@ -107,8 +107,8 @@ struct PsmfData { struct PsmfPlayerCreateData { PSPPointer buffer; - u32 bufferSize; - int threadPriority; + u32_le bufferSize; + s32_le threadPriority; }; struct PsmfPlayerData { @@ -889,7 +889,7 @@ static u32 scePsmfQueryStreamOffset(u32 bufferAddr, u32 offsetAddr) { WARN_LOG(ME, "scePsmfQueryStreamOffset(%08x, %08x)", bufferAddr, offsetAddr); if (Memory::IsValidAddress(offsetAddr)) { - Memory::Write_U32(bswap32(Memory::Read_U32(bufferAddr + PSMF_STREAM_OFFSET_OFFSET)), offsetAddr); + Memory::Write_U32(swap32(Memory::Read_U32(bufferAddr + PSMF_STREAM_OFFSET_OFFSET)), offsetAddr); } return 0; } @@ -898,7 +898,7 @@ static u32 scePsmfQueryStreamSize(u32 bufferAddr, u32 sizeAddr) { WARN_LOG(ME, "scePsmfQueryStreamSize(%08x, %08x)", bufferAddr, sizeAddr); if 
(Memory::IsValidAddress(sizeAddr)) { - Memory::Write_U32(bswap32(Memory::Read_U32(bufferAddr + PSMF_STREAM_SIZE_OFFSET)), sizeAddr); + Memory::Write_U32(swap32(Memory::Read_U32(bufferAddr + PSMF_STREAM_SIZE_OFFSET)), sizeAddr); } return 0; } @@ -1084,7 +1084,7 @@ static u32 scePsmfGetEPidWithTimestamp(u32 psmfStruct, u32 ts) static int scePsmfPlayerCreate(u32 psmfPlayer, u32 dataPtr) { - auto player = PSPPointer::Create(psmfPlayer); + auto player = PSPPointer::Create(psmfPlayer); const auto data = PSPPointer::Create(dataPtr); if (!player.IsValid() || !data.IsValid()) { diff --git a/Core/HLE/sceRtc.cpp b/Core/HLE/sceRtc.cpp index 5fc8d372ce7a..7fb0c606df01 100644 --- a/Core/HLE/sceRtc.cpp +++ b/Core/HLE/sceRtc.cpp @@ -502,7 +502,7 @@ static int sceRtcConvertLocalTimeToUTC(u32 tickLocalPtr,u32 tickUTCPtr) long timezone_val; _get_timezone(&timezone_val); srcTick -= -timezone_val * 1000000ULL; -#elif !defined(_AIX) && !defined(__sgi) && !defined(__hpux) && !defined(HAVE_LIBNX) +#elif !defined(_AIX) && !defined(__sgi) && !defined(__hpux) && !defined(HAVE_LIBNX) && !defined(__wiiu__) time_t timezone = 0; tm *time = localtime(&timezone); srcTick -= time->tm_gmtoff*1000000ULL; @@ -527,7 +527,7 @@ static int sceRtcConvertUtcToLocalTime(u32 tickUTCPtr,u32 tickLocalPtr) long timezone_val; _get_timezone(&timezone_val); srcTick += -timezone_val * 1000000ULL; -#elif !defined(_AIX) && !defined(__sgi) && !defined(__hpux) && !defined(HAVE_LIBNX) +#elif !defined(_AIX) && !defined(__sgi) && !defined(__hpux) && !defined(HAVE_LIBNX) && !defined(__wiiu__) time_t timezone = 0; tm *time = localtime(&timezone); srcTick += time->tm_gmtoff*1000000ULL; @@ -993,7 +993,7 @@ static int __RtcFormatRFC2822(u32 outPtr, u32 srcTickPtr, int tz) char *out = (char *)Memory::GetPointer(outPtr); char *end = out + 32; out += strftime(out, end - out, "%a, %d %b ", &local); - out += snprintf(out, end - out, "%04d", pt.year); + out += snprintf(out, end - out, "%04d", (s16)pt.year); out += strftime(out, end - 
out, " %H:%M:%S ", &local); if (tz < 0) out += snprintf(out, end - out, "-%02d%02d", -tz / 60, -tz % 60); @@ -1023,7 +1023,7 @@ static int __RtcFormatRFC3339(u32 outPtr, u32 srcTickPtr, int tz) char *out = (char *)Memory::GetPointer(outPtr); char *end = out + 32; - out += snprintf(out, end - out, "%04d", pt.year); + out += snprintf(out, end - out, "%04d", (u16)pt.year); out += strftime(out, end - out, "-%m-%dT%H:%M:%S.00", &local); if (tz == 0) out += snprintf(out, end - out, "Z"); @@ -1062,7 +1062,7 @@ static int sceRtcFormatRFC2822LocalTime(u32 outPtr, u32 srcTickPtr) long timezone_val; _get_timezone(&timezone_val); tz_seconds = -timezone_val; -#elif !defined(_AIX) && !defined(__sgi) && !defined(__hpux) && !defined(HAVE_LIBNX) +#elif !defined(_AIX) && !defined(__sgi) && !defined(__hpux) && !defined(HAVE_LIBNX) && !defined(__wiiu__) time_t timezone = 0; tm *time = localtime(&timezone); tz_seconds = time->tm_gmtoff; @@ -1099,7 +1099,7 @@ static int sceRtcFormatRFC3339LocalTime(u32 outPtr, u32 srcTickPtr) long timezone_val; _get_timezone(&timezone_val); tz_seconds = -timezone_val; -#elif !defined(_AIX) && !defined(__sgi) && !defined(__hpux) && !defined(HAVE_LIBNX) +#elif !defined(_AIX) && !defined(__sgi) && !defined(__hpux) && !defined(HAVE_LIBNX) && !defined(__wiiu__) time_t timezone = 0; tm *time = localtime(&timezone); tz_seconds = time->tm_gmtoff; diff --git a/Core/HLE/sceUsbGps.h b/Core/HLE/sceUsbGps.h index 24c2f5ccd043..c6b63fa15cb2 100644 --- a/Core/HLE/sceUsbGps.h +++ b/Core/HLE/sceUsbGps.h @@ -28,35 +28,35 @@ void __UsbGpsShutdown(); #pragma pack(push, 1) typedef struct { - short year; - short month; - short date; - short hour; - short minute; - short second; - float garbage1; - float hdop; - float garbage2; - float latitude; - float longitude; - float altitude; - float garbage3; - float speed; - float bearing; + s16_le year; + s16_le month; + s16_le date; + s16_le hour; + s16_le minute; + s16_le second; + float_le garbage1; + float_le hdop; + float_le 
garbage2; + float_le latitude; + float_le longitude; + float_le altitude; + float_le garbage3; + float_le speed; + float_le bearing; } GpsData; typedef struct { unsigned char id; unsigned char elevation; - short azimuth; + s16_le azimuth; unsigned char snr; unsigned char good; - short garbage; + s16_le garbage; } SatInfo; typedef struct { - short satellites_in_view; - short garbage; + s16_le satellites_in_view; + s16_le garbage; SatInfo satInfo[24]; } SatData; diff --git a/Core/HW/MediaEngine.cpp b/Core/HW/MediaEngine.cpp index 6b1d646bd122..8514bbdf0436 100644 --- a/Core/HW/MediaEngine.cpp +++ b/Core/HW/MediaEngine.cpp @@ -953,6 +953,8 @@ int MediaEngine::getAudioSamples(u32 bufferPtr) { if (!m_audioContext->Decode(audioFrame, frameSize, buffer, &outbytes)) { ERROR_LOG(ME, "Audio (%s) decode failed during video playback", GetCodecName(m_audioType)); + } else { + ToLEndian((s16*)buffer, outbytes / 2); } CBreakPoints::ExecMemCheck(bufferPtr, true, outbytes, currentMIPS->pc); diff --git a/Core/HW/MpegDemux.cpp b/Core/HW/MpegDemux.cpp index da121f72e0be..a77641794e59 100644 --- a/Core/HW/MpegDemux.cpp +++ b/Core/HW/MpegDemux.cpp @@ -202,6 +202,7 @@ bool MpegDemux::demux(int audioChannel) } // Not enough data available yet. 
if (m_readSize - m_index < 16) { + looksValid = true; m_index -= 4; break; } diff --git a/Core/HW/SasAudio.cpp b/Core/HW/SasAudio.cpp index 913c6948cf56..507d620ecc53 100644 --- a/Core/HW/SasAudio.cpp +++ b/Core/HW/SasAudio.cpp @@ -116,7 +116,7 @@ void VagDecoder::DecodeBlock(u8 *&read_pointer) { read_pointer = readp; } -void VagDecoder::GetSamples(s16 *outSamples, int numSamples) { +void VagDecoder::GetSamples(s16_le *outSamples, int numSamples) { if (end_) { memset(outSamples, 0, numSamples * sizeof(s16)); return; @@ -194,7 +194,7 @@ int SasAtrac3::setContext(u32 context) { return 0; } -void SasAtrac3::getNextSamples(s16 *outbuf, int wantedSamples) { +void SasAtrac3::getNextSamples(s16_le *outbuf, int wantedSamples) { if (atracID_ < 0) { end_ = true; return; @@ -204,7 +204,7 @@ void SasAtrac3::getNextSamples(s16 *outbuf, int wantedSamples) { while (!finish && sampleQueue_->getQueueSize() < wantedbytes) { u32 numSamples = 0; int remains = 0; - static s16 buf[0x800]; + static s16_le buf[0x800]; _AtracDecodeData(atracID_, (u8*)buf, 0, &numSamples, &finish, &remains); if (numSamples > 0) sampleQueue_->push((u8*)buf, numSamples * sizeof(s16)); @@ -419,7 +419,7 @@ int SasInstance::EstimateMixUs() { return std::min(cycles, 1200); } -void SasVoice::ReadSamples(s16 *output, int numSamples) { +void SasVoice::ReadSamples(s16_le *output, int numSamples) { // Read N samples into the resample buffer. Could do either PCM or VAG here. 
switch (type) { case VOICETYPE_VAG: @@ -428,7 +428,7 @@ void SasVoice::ReadSamples(s16 *output, int numSamples) { case VOICETYPE_PCM: { int needed = numSamples; - s16 *out = output; + s16_le *out = output; while (needed > 0) { u32 size = std::min(pcmSize - pcmIndex, needed); if (!on) { @@ -529,7 +529,7 @@ void SasInstance::MixVoice(SasVoice &voice) { const bool needsInterp = voicePitch != PSP_SAS_PITCH_BASE || (sampleFrac & PSP_SAS_PITCH_MASK) != 0; for (int i = delay; i < grainSize; i++) { - const int16_t *s = mixTemp_ + (sampleFrac >> PSP_SAS_PITCH_BASE_SHIFT); + const s16_le *s = mixTemp_ + (sampleFrac >> PSP_SAS_PITCH_BASE_SHIFT); // Linear interpolation. Good enough. Need to make resampleHist bigger if we want more. int sample = s[0]; @@ -587,16 +587,16 @@ void SasInstance::Mix(u32 outAddr, u32 inAddr, int leftVol, int rightVol) { // Then mix the send buffer in with the rest. // Alright, all voices mixed. Let's convert and clip, and at the same time, wipe mixBuffer for next time. Could also dither. - s16 *outp = (s16 *)Memory::GetPointer(outAddr); - const s16 *inp = inAddr ? (s16*)Memory::GetPointer(inAddr) : 0; + s16_le *outp = (s16_le *)Memory::GetPointer(outAddr); + const s16_le *inp = inAddr ? (s16_le *)Memory::GetPointer(inAddr) : 0; if (outputMode == PSP_SAS_OUTPUTMODE_MIXED) { // Okay, apply effects processing to the Send buffer. 
WriteMixedOutput(outp, inp, leftVol, rightVol); } else { - s16 *outpL = outp + grainSize * 0; - s16 *outpR = outp + grainSize * 1; - s16 *outpSendL = outp + grainSize * 2; - s16 *outpSendR = outp + grainSize * 3; + s16_le *outpL = outp + grainSize * 0; + s16_le *outpR = outp + grainSize * 1; + s16_le *outpSendL = outp + grainSize * 2; + s16_le *outpSendR = outp + grainSize * 3; WARN_LOG_REPORT_ONCE(sasraw, SASMIX, "sceSasCore: raw outputMode"); for (int i = 0; i < grainSize * 2; i += 2) { *outpL++ = clamp_s16(mixBuffer[i + 0]); @@ -613,7 +613,7 @@ void SasInstance::Mix(u32 outAddr, u32 inAddr, int leftVol, int rightVol) { #endif } -void SasInstance::WriteMixedOutput(s16 *outp, const s16 *inp, int leftVol, int rightVol) { +void SasInstance::WriteMixedOutput(s16_le *outp, const s16_le *inp, int leftVol, int rightVol) { const bool dry = waveformEffect.isDryOn != 0; const bool wet = waveformEffect.isWetOn != 0; if (wet) { diff --git a/Core/HW/SasAudio.h b/Core/HW/SasAudio.h index 480250dc705c..7476cffb0d16 100644 --- a/Core/HW/SasAudio.h +++ b/Core/HW/SasAudio.h @@ -100,7 +100,7 @@ class VagDecoder { } void Start(u32 dataPtr, u32 vagSize, bool loopEnabled); - void GetSamples(s16 *outSamples, int numSamples); + void GetSamples(s16_le *outSamples, int numSamples); void DecodeBlock(u8 *&readp); bool End() const { return end_; } @@ -133,7 +133,7 @@ class SasAtrac3 { SasAtrac3() : contextAddr_(0), atracID_(-1), sampleQueue_(0), end_(false) {} ~SasAtrac3() { if (sampleQueue_) delete sampleQueue_; } int setContext(u32 context); - void getNextSamples(s16 *outbuf, int wantedSamples); + void getNextSamples(s16_le *outbuf, int wantedSamples); int addStreamData(u32 bufPtr, u32 addbytes); void DoState(PointerWrap &p); bool End() const { @@ -235,7 +235,7 @@ struct SasVoice { void DoState(PointerWrap &p); - void ReadSamples(s16 *output, int numSamples); + void ReadSamples(s16_le *output, int numSamples); bool HaveSamplesEnded() const; bool playing; @@ -264,7 +264,7 @@ struct SasVoice 
{ // volume to "Send" (audio-lingo) to the effects processing engine, like reverb int effectLeft; int effectRight; - s16 resampleHist[2]; + s16_le resampleHist[2]; ADSREnvelope envelope; @@ -299,7 +299,7 @@ class SasInstance { // Applies reverb to send buffer, according to waveformEffect. void ApplyWaveformEffect(); void SetWaveformEffectType(int type); - void WriteMixedOutput(s16 *outp, const s16 *inp, int leftVol, int rightVol); + void WriteMixedOutput(s16_le *outp, const s16_le *inp, int leftVol, int rightVol); void GetDebugText(char *text, size_t bufsize); @@ -311,5 +311,5 @@ class SasInstance { private: SasReverb reverb_; int grainSize; - int16_t mixTemp_[PSP_SAS_MAX_GRAIN * 4 + 2 + 8]; // some extra margin for very high pitches. + s16_le mixTemp_[PSP_SAS_MAX_GRAIN * 4 + 2 + 8]; // some extra margin for very high pitches. }; diff --git a/Core/HW/SimpleAudioDec.cpp b/Core/HW/SimpleAudioDec.cpp index b6a51438c90a..0c1908754f61 100644 --- a/Core/HW/SimpleAudioDec.cpp +++ b/Core/HW/SimpleAudioDec.cpp @@ -339,7 +339,8 @@ u32 AuCtx::AuDecode(u32 pcmAddr) { if (!sourcebuff.empty()) { // FFmpeg doesn't seem to search for a sync for us, so let's do that. int nextSync = (int)FindNextMp3Sync(); - decoder->Decode(&sourcebuff[nextSync], (int)sourcebuff.size() - nextSync, outbuf, &outpcmbufsize); + decoder->Decode(&sourcebuff[nextSync], (int)sourcebuff.size() - nextSync, outbuf, &outpcmbufsize); + ToLEndian((u16*)outbuf, outpcmbufsize / 2); if (outpcmbufsize == 0) { // Nothing was output, hopefully we're at the end of the stream. 
diff --git a/Core/Instance.cpp b/Core/Instance.cpp index 8084c2ff257a..e489329586ac 100644 --- a/Core/Instance.cpp +++ b/Core/Instance.cpp @@ -79,7 +79,7 @@ static bool UpdateInstanceCounter(void (*callback)(volatile InstanceInfo *)) { UnmapViewOfFile(buf); return result; -#elif PPSSPP_PLATFORM(ANDROID) || defined(__LIBRETRO__) +#elif PPSSPP_PLATFORM(ANDROID) || PPSSPP_PLATFORM(WIIU) || defined(__LIBRETRO__) // TODO: replace shm_open & shm_unlink with ashmem or android-shmem return false; #else @@ -140,7 +140,7 @@ void InitInstanceCounter() { PPSSPP_ID = 1; return; } -#elif PPSSPP_PLATFORM(ANDROID) || defined(__LIBRETRO__) +#elif PPSSPP_PLATFORM(ANDROID) || PPSSPP_PLATFORM(WIIU) || defined(__LIBRETRO__) // TODO : replace shm_open & shm_unlink with ashmem or android-shmem #else // Create shared memory object @@ -177,7 +177,7 @@ void ShutdownInstanceCounter() { CloseHandle(mapLock); mapLock = nullptr; } -#elif PPSSPP_PLATFORM(ANDROID) || defined(__LIBRETRO__) +#elif PPSSPP_PLATFORM(ANDROID) || PPSSPP_PLATFORM(WIIU) || defined(__LIBRETRO__) // Do nothing #else if (hIDMapFile >= 0) { diff --git a/Core/Loaders.cpp b/Core/Loaders.cpp index 97eca751eece..627fb019e1c6 100644 --- a/Core/Loaders.cpp +++ b/Core/Loaders.cpp @@ -125,7 +125,7 @@ IdentifiedFileType Identify_File(FileLoader *fileLoader) { return IdentifiedFileType::ERROR_IDENTIFYING; } - u32 psar_offset = 0, psar_id = 0; + u32_le psar_offset = 0, psar_id = 0; u32 _id = id; if (!memcmp(&_id, "PK\x03\x04", 4) || !memcmp(&_id, "PK\x05\x06", 4) || !memcmp(&_id, "PK\x07\x08", 4)) { return IdentifiedFileType::ARCHIVE_ZIP; diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index 63a1d5718924..ad7cccda2c87 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -616,45 +616,47 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) { case IROp::Mult: { s64 result = (s64)(s32)mips->r[inst->src1] * (s64)(s32)mips->r[inst->src2]; - memcpy(&mips->lo, &result, 
8); + mips->lo = (u32)result; + mips->hi = (u32)(result >> 32); break; } case IROp::MultU: { u64 result = (u64)mips->r[inst->src1] * (u64)mips->r[inst->src2]; - memcpy(&mips->lo, &result, 8); + mips->lo = (u32)result; + mips->hi = (u32)(result >> 32); break; } case IROp::Madd: { - s64 result; - memcpy(&result, &mips->lo, 8); + s64 result = (s64)((u64)mips->lo | ((u64)mips->hi << 32)); result += (s64)(s32)mips->r[inst->src1] * (s64)(s32)mips->r[inst->src2]; - memcpy(&mips->lo, &result, 8); + mips->lo = (u32)result; + mips->hi = (u32)(result >> 32); break; } case IROp::MaddU: { - s64 result; - memcpy(&result, &mips->lo, 8); + s64 result = (s64)((u64)mips->lo | ((u64)mips->hi << 32)); result += (u64)mips->r[inst->src1] * (u64)mips->r[inst->src2]; - memcpy(&mips->lo, &result, 8); + mips->lo = (u32)result; + mips->hi = (u32)(result >> 32); break; } case IROp::Msub: { - s64 result; - memcpy(&result, &mips->lo, 8); + s64 result = (s64)((u64)mips->lo | ((u64)mips->hi << 32)); result -= (s64)(s32)mips->r[inst->src1] * (s64)(s32)mips->r[inst->src2]; - memcpy(&mips->lo, &result, 8); + mips->lo = (u32)result; + mips->hi = (u32)(result >> 32); break; } case IROp::MsubU: { - s64 result; - memcpy(&result, &mips->lo, 8); + s64 result = (s64)((u64)mips->lo | ((u64)mips->hi << 32)); result -= (u64)mips->r[inst->src1] * (u64)mips->r[inst->src2]; - memcpy(&mips->lo, &result, 8); + mips->lo = (u32)result; + mips->hi = (u32)(result >> 32); break; } diff --git a/Core/MIPS/JitCommon/JitBlockCache.cpp b/Core/MIPS/JitCommon/JitBlockCache.cpp index 9807ce955128..ad392a852980 100644 --- a/Core/MIPS/JitCommon/JitBlockCache.cpp +++ b/Core/MIPS/JitCommon/JitBlockCache.cpp @@ -660,6 +660,8 @@ JitBlockDebugInfo JitBlockCache::GetBlockDebugInfo(int blockNum) const { debugInfo.targetDisasm = DisassembleArm64(block->normalEntry, block->codeSize); #elif PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64) debugInfo.targetDisasm = DisassembleX86(block->normalEntry, block->codeSize); +#elif defined(__wiiu__) + 
debugInfo.targetDisasm = DisassemblePPC(block->normalEntry, block->codeSize); #endif return debugInfo; diff --git a/Core/MIPS/JitCommon/JitCommon.cpp b/Core/MIPS/JitCommon/JitCommon.cpp index 397cc85ad9b5..ee975bfd5a86 100644 --- a/Core/MIPS/JitCommon/JitCommon.cpp +++ b/Core/MIPS/JitCommon/JitCommon.cpp @@ -31,6 +31,10 @@ #include "Core/MIPS/JitCommon/JitState.h" #include "Core/MIPS/IR/IRJit.h" +#if PPSSPP_PLATFORM(WIIU) +#include +#endif + #if PPSSPP_ARCH(ARM) #include "../ARM/ArmJit.h" #elif PPSSPP_ARCH(ARM64) @@ -39,6 +43,8 @@ #include "../x86/Jit.h" #elif PPSSPP_ARCH(MIPS) #include "../MIPS/MipsJit.h" +#elif PPSSPP_ARCH(POWERPC) +#include "../PPC/PpcJit.h" #else #include "../fake/FakeJit.h" #endif @@ -68,7 +74,7 @@ namespace MIPSComp { return new MIPSComp::ArmJit(mips); #elif PPSSPP_ARCH(ARM64) return new MIPSComp::Arm64Jit(mips); -#elif PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64) +#elif PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64) || PPSSPP_ARCH(POWERPC) return new MIPSComp::Jit(mips); #elif PPSSPP_ARCH(MIPS) return new MIPSComp::MipsJit(mips); @@ -270,3 +276,30 @@ std::vector DisassembleX86(const u8 *data, int size) { } #endif + +#if PPSSPP_PLATFORM(WIIU) +static std::vector lines; +static void PPCLineCallback(const char* fmt, ...) 
{ + char* line = nullptr; + + va_list args; + va_start(args, fmt); + vasprintf(&line, fmt, args); + va_end(args); + + if(line) { + lines.push_back(line); + free(line); + } +} +std::vector DisassemblePPC(const u8 *data, int size) { + lines.clear(); + size >>= 2; + while (size-- > 0) { + DisassemblePPCRange(data, data, PPCLineCallback, nullptr, DISASM_FLAG_SIMPLE | DISASM_FLAG_SHORT); + data += 4; + lines.back().pop_back(); + } + return lines; +} +#endif diff --git a/Core/MIPS/JitCommon/JitCommon.h b/Core/MIPS/JitCommon/JitCommon.h index 08063f866e49..b71203a3fd7f 100644 --- a/Core/MIPS/JitCommon/JitCommon.h +++ b/Core/MIPS/JitCommon/JitCommon.h @@ -28,7 +28,9 @@ std::vector DisassembleArm2(const u8 *data, int size); std::vector DisassembleArm64(const u8 *data, int size); std::vector DisassembleX86(const u8 *data, int size); - +#ifdef __wiiu__ +std::vector DisassemblePPC(const u8 *data, int size); +#endif struct JitBlock; class JitBlockCache; class JitBlockCacheDebugInterface; diff --git a/Core/MIPS/MIPSAnalyst.cpp b/Core/MIPS/MIPSAnalyst.cpp index 10b972028190..8c5760ea36ab 100644 --- a/Core/MIPS/MIPSAnalyst.cpp +++ b/Core/MIPS/MIPSAnalyst.cpp @@ -679,7 +679,8 @@ namespace MIPSAnalyst { int vt = (((op >> 16) & 0x1f)) | ((op & 1) << 5); float rd[4]; ReadVector(rd, V_Quad, vt); - return memcmp(rd, Memory::GetPointer(addr), sizeof(float) * 4) != 0; + float_le* ptr = (float_le *)Memory::GetPointer(addr); + return (rd[0] != ptr[0]) || (rd[1] != ptr[1]) || (rd[2] != ptr[2]) || (rd[3] != ptr[3]); } // TODO: Technically, the break might be for 1 byte in the middle of a sw. 
diff --git a/Core/MIPS/MIPSIntVFPU.cpp b/Core/MIPS/MIPSIntVFPU.cpp index bc8d698c59a0..83953ef208cb 100644 --- a/Core/MIPS/MIPSIntVFPU.cpp +++ b/Core/MIPS/MIPSIntVFPU.cpp @@ -243,7 +243,7 @@ namespace MIPSInt { _dbg_assert_msg_( 0, "Misaligned lv.q at %08x (pc = %08x)", addr, PC); } -#ifndef COMMON_BIG_ENDIAN +#ifndef __BIG_ENDIAN__ WriteVector((const float*)Memory::GetPointer(addr), V_Quad, vt); #else float lvqd[4]; @@ -290,7 +290,7 @@ namespace MIPSInt { _dbg_assert_msg_( 0, "Misaligned sv.q at %08x (pc = %08x)", addr, PC); } -#ifndef COMMON_BIG_ENDIAN +#ifndef __BIG_ENDIAN__ ReadVector(reinterpret_cast(Memory::GetPointer(addr)), V_Quad, vt); #else float svqd[4]; diff --git a/Core/MIPS/PPC/PpcAsm.cpp b/Core/MIPS/PPC/PpcAsm.cpp new file mode 100644 index 000000000000..905b115cfea1 --- /dev/null +++ b/Core/MIPS/PPC/PpcAsm.cpp @@ -0,0 +1,292 @@ + +#include "ppsspp_config.h" +#if PPSSPP_ARCH(POWERPC) + +#include "Common/Serialize/Serializer.h" +#include "Core/Core.h" +#include "Core/CoreTiming.h" +#include "Core/MemMap.h" +#include "Core/MIPS/MIPS.h" +#include "Core/MIPS/MIPSCodeUtils.h" +#include "Core/MIPS/MIPSInt.h" +#include "Core/MIPS/MIPSTables.h" +#include "Core/MIPS/JitCommon/JitCommon.h" + +#include "PpcRegCache.h" +#include "ppcEmitter.h" +#include "PpcJit.h" + +//#include + +using namespace PpcGen; + +extern volatile CoreState coreState; + +namespace MIPSComp +{ +static int dontLogBlocks = 20; +static int logBlocks = 40; + +const u8 *Jit::DoJit(u32 em_address, JitBlock *b) +{ + js.cancel = false; + js.blockStart = js.compilerPC = mips_->pc; + js.downcountAmount = 0; + js.curBlock = b; + js.compiling = true; + js.inDelaySlot = false; + js.PrefixStart(); + + // We add a check before the block, used when entering from a linked block. + b->checkedEntry = (u8*)GetCodePtr(); + // Downcount flag check. The last block decremented downcounter, and the flag should still be available. 
+ + MOVI2R(SREG, js.blockStart); + + // if (currentMIPS->downcount<0) + CMPI(DCNTREG, 0); + BLT((const void *)outerLoopPCInR0); + + b->normalEntry = GetCodePtr(); + // TODO: this needs work + MIPSAnalyst::AnalysisResults analysis; // = MIPSAnalyst::Analyze(em_address); + + gpr.Start(analysis); + fpr.Start(analysis); + + int numInstructions = 0; + int cycles = 0; + int partialFlushOffset = 0; + if (logBlocks > 0) logBlocks--; + if (dontLogBlocks > 0) dontLogBlocks--; + +// #define LOGASM +#ifdef LOGASM + char temp[256]; +#endif + while (js.compiling) + { + gpr.SetCompilerPC(js.compilerPC); // Let it know for log messages + fpr.SetCompilerPC(js.compilerPC); + MIPSOpcode inst = Memory::Read_Instruction(js.compilerPC); + js.downcountAmount += MIPSGetInstructionCycleEstimate(inst); + + MIPSCompileOp(inst, this); + + js.compilerPC += 4; + numInstructions++; + } +#ifdef LOGASM + if (logBlocks > 0 && dontLogBlocks == 0) { + for (u32 cpc = em_address; cpc != js.compilerPC + 4; cpc += 4) { + MIPSDisAsm(Memory::Read_Instruction(cpc), cpc, temp, true); + INFO_LOG(JIT, "M: %08x %s", cpc, temp); + } + } +#endif + + b->codeSize = GetCodePtr() - b->normalEntry; + +#if defined(LOGASM) && defined(__wiiu__) + if (logBlocks > 0 && dontLogBlocks == 0) { + INFO_LOG(JIT, "=============== ARM ==============="); + DisassemblePPC(b->normalEntry, GetCodePtr() - b->normalEntry); + } +#endif + //DumpJit(); + + AlignCode16(); + + // Don't forget to zap the instruction cache! 
+ FlushIcache(); + + b->originalSize = numInstructions; + return b->normalEntry; +} + +void Jit::DumpJit() { +#if defined(_XBOX) || defined(__wiiu__) +#ifdef _XBOX + const char* filename = "game:\\jit.bin"; +#else + const char* filename = "sd:/jit.bin"; +#endif + u32 len = (u32)GetCodePtr() - (u32)GetBasePtr(); + FILE * fd; + fd = fopen(filename, "wb"); + fwrite(GetBasePtr(), len, 1, fd); + fclose(fd); +#endif +} + +void Jit::GenerateFixedCode() { + enterCode = AlignCode16(); + + INFO_LOG(JIT, "Base: %08x", (u32)Memory::base); + INFO_LOG(JIT, "enterCode: 0x%08x", (u32)enterCode); + INFO_LOG(JIT, "GetBasePtr: 0x%08x", (u32)GetBasePtr()); + + Prologue(); + + // Map fixed register + MOVI2R(BASEREG, (u32)Memory::base); + MOVI2R(CTXREG, (u32)mips_); + MOVI2R(CODEREG, (u32)GetBasePtr()); + + // Update downcount reg value from memory + RestoreDowncount(DCNTREG); + + // SREG = mips->pc + MovFromPC(SREG); + + // Keep current location, TODO rename it, outerLoopPCInR0 to outerLoopPCInR3 ?? + outerLoopPCInR0 = GetCodePtr(); + + // mips->pc = SREG + MovToPC(SREG); + + // Keep current location + outerLoop = GetCodePtr(); + + // Jit loop + // { + // Save downcount reg value to memory + SaveDowncount(DCNTREG); + // Call CoreTiming::Advance() => update donwcount + QuickCallFunction((void *)&CoreTiming::Advance); + // Update downcount reg value from memory + RestoreDowncount(DCNTREG); + + // branch to skipToRealDispatch + FixupBranch skipToRealDispatch = B(); //skip the sync and compare first time + + // Keep current location dispatcherCheckCoreState: + dispatcherCheckCoreState = GetCodePtr(); + + // The result of slice decrementation should be in flags if somebody jumped here + // IMPORTANT - We jump on negative, not carry!!! + // branch to bailCoreState: (jump if(what ??) negative ) + FixupBranch bailCoreState = BLT(); // BLT ??? 
+ + // SREG = coreState + MOVI2R(SREG, (u32)&coreState); + // Compare coreState and CORE_RUNNING + LWZ(SREG, SREG); // SREG = *SREG + CMPI(SREG, 0); // compare 0(CORE_RUNNING) and CR0 + + // branch to badCoreState: (jump if coreState != CORE_RUNNING) + FixupBranch badCoreState = BNE(); + + // branch to skipToRealDispatch2: + FixupBranch skipToRealDispatch2 = B(); //skip the sync and compare first time + + // Keep current location, TODO rename it, outerLoopPCInR0 to outerLoopPCInSREG ?? + dispatcherPCInR0 = GetCodePtr(); + + // mips->pc = SREG + MovToPC(SREG); + + // At this point : flags = EQ. Fine for the next check, no need to jump over it. + // label dispatcher: + dispatcher = GetCodePtr(); + + // { + // The result of slice decrementation should be in flags if somebody jumped here + // IMPORTANT - We jump on negative, not carry!!! + // label bail: + // arm B_CC(CC_MI); + FixupBranch bail = BLT(); + + // label skipToRealDispatch: + SetJumpTarget(skipToRealDispatch); + + // label skipToRealDispatch2: + SetJumpTarget(skipToRealDispatch2); + + // Keep current location + dispatcherNoCheck = GetCodePtr(); + + // read op + // R3 = mips->pc & Memory::MEMVIEW32_MASK + LWZ(R3, CTXREG, offsetof(MIPSState, pc)); + // & Memory::MEMVIEW32_MASK + RLWINM(R3, R3, 0, 2, 31); + + // R3 = memory::base[r3]; + ADD(R3, BASEREG, R3); + MOVI2R(R0, 0); + LWBRX(R3, R3, R0); + + // R4 = R3 & MIPS_EMUHACK_VALUE_MASK + RLWINM(R4, R3, 0, 8, 31); + + // R3 = R3 & MIPS_EMUHACK_MASK + RLWINM(R3, R3, 0, 0, 5); + + // compare, op == MIPS_EMUHACK_OPCODE + MOVI2R(SREG, MIPS_EMUHACK_OPCODE); + CMPL(R3, SREG); + + // Branch if func block not found + FixupBranch notfound = BNE(); + + // { + // R3 = R4 + GetBasePtr() + ADD(R3, R4, CODEREG); + + MTCTR(R3); + BCTR(); + // } + + // label notfound: + SetJumpTarget(notfound); + + //Ok, no block, let's jit + // Save downcount reg value to memory + SaveDowncount(DCNTREG); + + // Exec JitAt => Compile block ! 
+ QuickCallFunction((void *)&JitAt); + + // Update downcount reg value from memory + RestoreDowncount(DCNTREG); + + // branch to dispatcherNoCheck: + B(dispatcherNoCheck); // no point in special casing this + // } + + // label bail: + SetJumpTarget(bail); + + // label bailCoreState: + SetJumpTarget(bailCoreState); + + // Compare coreState and CORE_RUNNING + MOVI2R(SREG, (u32)&coreState); + LWZ(SREG, SREG); // SREG = *SREG => SREG = coreState + CMPLI(SREG, 0); // compare 0(CORE_RUNNING) and corestate + + BEQ(outerLoop); + // } + + // badCoreState label: + SetJumpTarget(badCoreState); + + // Keep current location + breakpointBailout = GetCodePtr(); + + // mips->downcount = DCNTREG + SaveDowncount(DCNTREG); + + Epilogue(); + + // Go back to caller + BLR(); + + // Don't forget to zap the instruction cache! + FlushIcache(); +} + +} + +#endif diff --git a/Core/MIPS/PPC/PpcCompAlu.cpp b/Core/MIPS/PPC/PpcCompAlu.cpp new file mode 100644 index 000000000000..3ef758476856 --- /dev/null +++ b/Core/MIPS/PPC/PpcCompAlu.cpp @@ -0,0 +1,568 @@ +#include "ppsspp_config.h" +#if PPSSPP_ARCH(POWERPC) + +#include "Common/Serialize/Serializer.h" +#include "Core/Core.h" +#include "Core/CoreTiming.h" +#include "Core/MIPS/MIPS.h" +#include "Core/MIPS/MIPSCodeUtils.h" +#include "Core/MIPS/MIPSInt.h" +#include "Core/MIPS/MIPSTables.h" + +#include "PpcRegCache.h" +#include "ppcEmitter.h" +#include "PpcJit.h" + +/*************************************************************************************************** +* Current issues: +* Comp_RType3(min/max): Can't select start in disgaea +* Comp_ShiftType(srl/srlv?): Crash ridge racer 2 +***************************************************************************************************/ + + +using namespace MIPSAnalyst; +#define _RS MIPS_GET_RS(op) +#define _RT MIPS_GET_RT(op) +#define _RD MIPS_GET_RD(op) +#define _FS MIPS_GET_FS(op) +#define _FT MIPS_GET_FT(op) +#define _FD MIPS_GET_FD(op) +#define _SA MIPS_GET_SA(op) +#define _POS ((op>> 6) & 
0x1F) +#define _SIZE ((op>>11) & 0x1F) +#define _IMM16 (signed short)(op & 0xFFFF) +#define _IMM26 (op & 0x03FFFFFF) + +// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly. +// Currently known non working ones should have DISABLE. + +//#define CONDITIONAL_DISABLE { Comp_Generic(op); return; } +#define CONDITIONAL_DISABLE ; +#define DISABLE { Comp_Generic(op); return; } + +namespace MIPSComp +{ +using namespace PpcJitConstants; + + static u32 EvalOr(u32 a, u32 b) { return a | b; } + static u32 EvalXor(u32 a, u32 b) { return a ^ b; } + static u32 EvalAnd(u32 a, u32 b) { return a & b; } + static u32 EvalAdd(u32 a, u32 b) { return a + b; } + static u32 EvalSub(u32 a, u32 b) { return a - b; } + static u32 EvalNor(u32 a, u32 b) { return ~(a | b); } + + // Utilities to reduce duplicated code + void Jit::CompType3(int rd, int rs, int rt, void (PPCXEmitter::*arith)(PPCReg Rd, PPCReg Ra, PPCReg Rb), u32 (*eval)(u32 a, u32 b), bool isSub) { + if (gpr.IsImm(rs) && gpr.IsImm(rt)) { + gpr.SetImm(rd, (*eval)(gpr.GetImm(rs), gpr.GetImm(rt))); + } else if (gpr.IsImm(rt)) { + u32 rtImm = gpr.GetImm(rt); + gpr.MapDirtyIn(rd, rs); + MOVI2R(SREG, rtImm); + (this->*arith)(gpr.R(rd), gpr.R(rs), SREG); + } else if (gpr.IsImm(rs)) { + u32 rsImm = gpr.GetImm(rs); + gpr.MapDirtyIn(rd, rt); + // TODO: Special case when rsImm can be represented as an Operand2 + MOVI2R(SREG, rsImm); + (this->*arith)(gpr.R(rd), SREG, gpr.R(rt)); + } else { + // Generic solution + gpr.MapDirtyInIn(rd, rs, rt); + (this->*arith)(gpr.R(rd), gpr.R(rs), gpr.R(rt)); + } + } + + void Jit::CompImmLogic(int rs, int rt, u32 uimm, void (PPCXEmitter::*arith)(PPCReg Rd, PPCReg Ra, unsigned short imm), u32 (*eval)(u32 a, u32 b)) + { + if (gpr.IsImm(rs)) { + gpr.SetImm(rt, (*eval)(gpr.GetImm(rs), uimm)); + } else { + gpr.MapDirtyIn(rt, rs); + (this->*arith)(gpr.R(rt), gpr.R(rs), uimm); + } + } + + void Jit::Comp_IType(MIPSOpcode op) + { + CONDITIONAL_DISABLE; + s32 simm = 
(s32)(s16)(op & 0xFFFF); // sign extension + u32 uimm = op & 0xFFFF; + u32 suimm = (u32)(s32)simm; + + int rt = _RT; + int rs = _RS; + + int o = op>>26; + + // noop, won't write to ZERO. + if (rt == 0) + return; + + switch (op >> 26) + { + + case 8: // same as addiu? + case 9: // R(rt) = R(rs) + simm; break; //addiu + { + if (gpr.IsImm(rs)) { + gpr.SetImm(rt, gpr.GetImm(rs) + simm); + } else { + gpr.MapDirtyIn(rt, rs); + ADDI(gpr.R(rt), gpr.R(rs), simm); + } + break; + } + + // Use with caution can change CR0 ! + case 12: CompImmLogic(rs, rt, uimm, &PPCXEmitter::ANDI, &EvalAnd); break; + // Safe + case 13: CompImmLogic(rs, rt, uimm, &PPCXEmitter::ORI, &EvalOr); break; + case 14: CompImmLogic(rs, rt, uimm, &PPCXEmitter::XORI, &EvalXor); break; + case 15: // R(rt) = uimm << 16; //lui + gpr.SetImm(rt, uimm << 16); + break; + + case 10: // slti - R(rt) = (s32)R(rs) < simm + if (gpr.IsImm(rs)) + { + gpr.SetImm(rt, (s32)gpr.GetImm(rs) < simm); + break; + } else { + //DISABLE; + gpr.MapDirtyIn(rt, rs); + + PPCReg ppc_rt = gpr.R(rt); + PPCReg ppc_rs = gpr.R(rs); + + MOVI2R(R0, 0); + ADDI(SREG, R0, uimm); + + SUBFC(R0, SREG, ppc_rs); + EQV(ppc_rt, SREG, ppc_rs); + SRWI(ppc_rt, ppc_rt, 31); + ADDZE(ppc_rt, ppc_rt); + RLWINM(ppc_rt, ppc_rt, 0, 31, 31); + //Break(); + break; + } + + case 11: //sltiu + if (gpr.IsImm(rs)) + { + gpr.SetImm(rt, gpr.GetImm(rs) < suimm); + break; + } else { + //DISABLE; + gpr.MapDirtyIn(rt, rs); + + PPCReg ppc_rt = gpr.R(rt); + + ADDI(SREG, R0, suimm); + SUBFC(ppc_rt, SREG, gpr.R(rs)); + SUBFE(ppc_rt, ppc_rt, ppc_rt); + NEG(ppc_rt, ppc_rt); + + break; + } + + default: + Comp_Generic(op); + break; + } + } + + void Jit::Comp_RType2(MIPSOpcode op) { + Comp_Generic(op); + } + + + void Jit::Comp_RType3(MIPSOpcode op) { + CONDITIONAL_DISABLE; + int rt = _RT; + int rs = _RS; + int rd = _RD; + + // noop, won't write to ZERO. 
+ if (rd == 0) + return; + + u8 o = op & 63; + + switch (op & 63) + { + case 10: // if (R(rt) == 0) R(rd) = R(rs); break; //movz + if (rd == rs) + break; + if (!gpr.IsImm(rt)) + { + gpr.MapDirtyInIn(rd, rt, rs, false); + CMPI(gpr.R(rt), 0); + PpcGen::FixupBranch ptr; + + ptr = B_Cond(_BNE); + + MR(gpr.R(rd), gpr.R(rs)); + + SetJumpTarget(ptr); + + } + else if (gpr.GetImm(rt) == 0) + { + // Yes, this actually happens. + if (gpr.IsImm(rs)) + gpr.SetImm(rd, gpr.GetImm(rs)); + else + { + gpr.MapDirtyIn(rd, rs); + MR(gpr.R(rd), gpr.R(rs)); + } + } + break; + + case 11:// if (R(rt) != 0) R(rd) = R(rs); break; //movn + if (rd == rs) + break; + if (!gpr.IsImm(rt)) + { + gpr.MapDirtyInIn(rd, rt, rs, false); + CMPI(gpr.R(rt), 0); + + PpcGen::FixupBranch ptr; + + ptr = B_Cond(_BEQ); + + MR(gpr.R(rd), gpr.R(rs)); + + SetJumpTarget(ptr); + } + else if (gpr.GetImm(rt) != 0) + { + // Yes, this actually happens. + if (gpr.IsImm(rs)) + gpr.SetImm(rd, gpr.GetImm(rs)); + else + { + gpr.MapDirtyIn(rd, rs); + MR(gpr.R(rd), gpr.R(rs)); + } + } + break; + + case 32: //R(rd) = R(rs) + R(rt); break; //add + case 33: //R(rd) = R(rs) + R(rt); break; //addu + // Some optimized special cases + if (gpr.IsImm(rs) && gpr.GetImm(rs) == 0) { + gpr.MapDirtyIn(rd, rt); + MR(gpr.R(rd), gpr.R(rt)); + } else if (gpr.IsImm(rt) && gpr.GetImm(rt) == 0) { + gpr.MapDirtyIn(rd, rs); + MR(gpr.R(rd), gpr.R(rs)); + } else { + CompType3(rd, rs, rt, &PPCXEmitter::ADD, &EvalAdd); + } + break; + case 34: //R(rd) = R(rs) - R(rt); break; //sub + case 35: //R(rd) = R(rs) - R(rt); break; //subu + CompType3(rd, rs, rt, &PPCXEmitter::SUB, &EvalSub, true); + break; + case 36: //R(rd) = R(rs) & R(rt); break; //and + CompType3(rd, rs, rt, &PPCXEmitter::AND, &EvalAnd); + break; + case 37: //R(rd) = R(rs) | R(rt); break; //or + CompType3(rd, rs, rt, &PPCXEmitter::OR, &EvalOr); + break; + case 38: //R(rd) = R(rs) ^ R(rt); break; //xor/eor + CompType3(rd, rs, rt, &PPCXEmitter::XOR, &EvalXor); + break; + // Not tested ! 
+ case 39: // R(rd) = ~(R(rs) | R(rt)); break; //nor + CompType3(rd, rs, rt, &PPCXEmitter::NOR, &EvalNor); + break; + + case 42: //R(rd) = (int)R(rs) < (int)R(rt); break; //slt + if (gpr.IsImm(rs) && gpr.IsImm(rt)) { + gpr.SetImm(rd, (s32)gpr.GetImm(rs) < (s32)gpr.GetImm(rt)); + } else { + gpr.MapDirtyInIn(rd, rs, rt); + + PPCReg ppc_rd = gpr.R(rd); + PPCReg ppc_rs = gpr.R(rs); + PPCReg ppc_rt = gpr.R(rt); + + SUBFC(R0, ppc_rt, ppc_rs); + EQV(ppc_rd, ppc_rt, ppc_rs); + SRWI(ppc_rd, ppc_rd, 31); + ADDZE(ppc_rd, ppc_rd); + RLWINM(ppc_rd, ppc_rd, 0, 31, 31); + } + + break; + + case 43: //R(rd) = R(rs) < R(rt); break; //sltu + if (gpr.IsImm(rs) && gpr.IsImm(rt)) { + gpr.SetImm(rd, gpr.GetImm(rs) < gpr.GetImm(rt)); + } else { + gpr.MapDirtyInIn(rd, rs, rt); + + PPCReg ppc_rd = gpr.R(rd); + + SUBFC(ppc_rd, gpr.R(rt), gpr.R(rs)); + SUBFE(ppc_rd, ppc_rd, ppc_rd); + NEG(ppc_rd, ppc_rd); + } + break; + + + case 44:// R(rd) = ((s32)R(rs) > (s32)R(rt)) ? R(rs) : R(rt); break; //max + DISABLE; + if (gpr.IsImm(rs) && gpr.IsImm(rt)) + gpr.SetImm(rd, std::max((s32)gpr.GetImm(rs), (s32)gpr.GetImm(rt))); + else + { + gpr.MapDirtyInIn(rd, rs, rt); + PpcGen::FixupBranch end; + + // by default rd = rt + MR(gpr.R(rd), gpr.R(rt)); + + // if rs > rt => end + CMP(gpr.R(rs), gpr.R(rt)); + end = B_Cond(_BLE); + + // rd = rs + MR(gpr.R(rd), gpr.R(rs)); + + SetJumpTarget(end); + } + break; + + case 45: //min + DISABLE; + if (gpr.IsImm(rs) && gpr.IsImm(rt)) + gpr.SetImm(rd, std::min((s32)gpr.GetImm(rs), (s32)gpr.GetImm(rt))); + else + { + gpr.MapDirtyInIn(rd, rs, rt); + PpcGen::FixupBranch end; + + // by default rd = rt + MR(gpr.R(rd), gpr.R(rt)); + + // if rs < rt => end + CMP(gpr.R(rs), gpr.R(rt)); + end = B_Cond(_BGE); + + // rd = rs + MR(gpr.R(rd), gpr.R(rs)); + + SetJumpTarget(end); + } + break; + + + default: + Comp_Generic(op); + break; + } + } + + /** + * srl/srlv are disabled because they crash rr2 + **/ + void Jit::Comp_ShiftType(MIPSOpcode op) { + CONDITIONAL_DISABLE; + int rs = _RS; 
+ int rd = _RD; + int fd = _FD; + int rt = _RT; + int sa = _SA; + + // noop, won't write to ZERO. + if (rd == 0) + return; + + // WARNING : ROTR + switch (op & 0x3f) + { + case 0: //sll + gpr.MapDirtyIn(rd, rt); + SLWI(gpr.R(rd), gpr.R(rt), sa); + break; + + case 2: + DISABLE; + if (rs == 0) // srl + { + gpr.MapDirtyIn(rd, rt); + SRWI(gpr.R(rd), gpr.R(rt), sa); + //Break(); + break; + } + else // rotr + { + gpr.MapDirtyIn(rd, rt); + ROTRWI(gpr.R(rd), gpr.R(rt), sa); + Break(); + break; + } + + case 3: //sra + gpr.MapDirtyIn(rd, rt); + SRAWI(gpr.R(rd), gpr.R(rt), sa); + break; + + case 4: //sllv + if (gpr.IsImm(rs)) + { + int sa = gpr.GetImm(rs) & 0x1F; + gpr.MapDirtyIn(rd, rt); + SLWI(gpr.R(rd), gpr.R(rt), sa); + break; + } + gpr.MapDirtyInIn(rd, rs, rt); + ANDI(SREG, gpr.R(rs), 0x1F); + SLW(gpr.R(rd), gpr.R(rt), SREG); + break; + + case 6: + DISABLE; + if ( fd == 0) { //srlv + if (gpr.IsImm(rs)) + { + int sa = gpr.GetImm(rs) & 0x1F; + gpr.MapDirtyIn(rd, rt); + SRWI(gpr.R(rd), gpr.R(rt), sa); + break; + } else { + gpr.MapDirtyInIn(rd, rs, rt); + ANDI(SREG, gpr.R(rs), 0x1F); + SRW(gpr.R(rd), gpr.R(rt), SREG); + break; + } + } else { // rotrv + if (gpr.IsImm(rs)) + { + int sa = gpr.GetImm(rs) & 0x1F; + gpr.MapDirtyIn(rd, rt); + ROTRWI(gpr.R(rd), gpr.R(rt), sa); + break; + } + // Not made + DISABLE; + } + break; + + case 7: //srav + if (gpr.IsImm(rs)) + { + int sa = gpr.GetImm(rs) & 0x1F; + gpr.MapDirtyIn(rd, rt); + SRAWI(gpr.R(rd), gpr.R(rt), sa); + break; + } + gpr.MapDirtyInIn(rd, rs, rt); + ANDI(SREG, gpr.R(rs), 0x1F); + SRAW(gpr.R(rd), gpr.R(rt), SREG); + break; + + default: + Comp_Generic(op); + break; + } + } + + void Jit::Comp_Allegrex(MIPSOpcode op) { + Comp_Generic(op); + } + + void Jit::Comp_Allegrex2(MIPSOpcode op) { + Comp_Generic(op); + } + + void Jit::Comp_MulDivType(MIPSOpcode op) { + CONDITIONAL_DISABLE; + MIPSGPReg rt = _RT; + MIPSGPReg rs = _RS; + int rd = _RD; + + switch (op & 63) + { + case 16: // R(rd) = HI; //mfhi + gpr.MapDirtyIn(rd, 
MIPSREG_HI); + MR(gpr.R(rd), gpr.R(MIPSREG_HI)); + break; + + case 17: // HI = R(rs); //mthi + gpr.MapDirtyIn(MIPSREG_HI, rs); + MR(gpr.R(MIPSREG_HI), gpr.R(rs)); + break; + + case 18: // R(rd) = LO; break; //mflo + gpr.MapDirtyIn(rd, MIPSREG_LO); + MR(gpr.R(rd), gpr.R(MIPSREG_LO)); + break; + + case 19: // LO = R(rs); break; //mtlo + gpr.MapDirtyIn(MIPSREG_LO, rs); + MR(gpr.R(MIPSREG_LO), gpr.R(rs)); + break; + + case 24: //mult (the most popular one). lo,hi = signed mul (rs * rt) + gpr.MapDirtyDirtyInIn(MIPSREG_LO, MIPSREG_HI, rs, rt); + MULLW(gpr.R(MIPSREG_LO), gpr.R(rs), gpr.R(rt)); + MULHW(gpr.R(MIPSREG_HI), gpr.R(rs), gpr.R(rt)); + break; + + case 25: //multu (2nd) lo,hi = unsigned mul (rs * rt) + gpr.MapDirtyDirtyInIn(MIPSREG_LO, MIPSREG_HI, rs, rt); + MULLW(gpr.R(MIPSREG_LO), gpr.R(rs), gpr.R(rt)); + MULHWU(gpr.R(MIPSREG_HI), gpr.R(rs), gpr.R(rt)); + break; + + case 26: //div + gpr.MapDirtyDirtyInIn(MIPSREG_LO, MIPSREG_HI, rs, rt); + DIVW(gpr.R(MIPSREG_LO), gpr.R(rs), gpr.R(rt)); + MULLW(SREG, gpr.R(rt), gpr.R(MIPSREG_LO)); + SUB(gpr.R(MIPSREG_HI), gpr.R(rs), SREG); + break; + + case 27: //divu + gpr.MapDirtyDirtyInIn(MIPSREG_LO, MIPSREG_HI, rs, rt); + DIVWU(gpr.R(MIPSREG_LO), gpr.R(rs), gpr.R(rt)); + MULLW(SREG, gpr.R(rt), gpr.R(MIPSREG_LO)); + SUB(gpr.R(MIPSREG_HI), gpr.R(rs), SREG); + break; + + case 28: //madd + DISABLE; + gpr.MapDirtyDirtyInIn(MIPSREG_LO, MIPSREG_HI, rs, rt, false); + break; + + case 29: //maddu + DISABLE; + gpr.MapDirtyDirtyInIn(MIPSREG_LO, MIPSREG_HI, rs, rt, false); + break; + + case 46: // msub + DISABLE; + gpr.MapDirtyDirtyInIn(MIPSREG_LO, MIPSREG_HI, rs, rt, false); + break; + + case 47: // msubu + DISABLE; + gpr.MapDirtyDirtyInIn(MIPSREG_LO, MIPSREG_HI, rs, rt, false); + break; + + default: + DISABLE; + } + } + + void Jit::Comp_Special3(MIPSOpcode op) { + Comp_Generic(op); + } + +} + +#endif diff --git a/Core/MIPS/PPC/PpcCompBranch.cpp b/Core/MIPS/PPC/PpcCompBranch.cpp new file mode 100644 index 000000000000..faed0a6458bf --- 
/dev/null +++ b/Core/MIPS/PPC/PpcCompBranch.cpp @@ -0,0 +1,450 @@ +#include "ppsspp_config.h" +#if PPSSPP_ARCH(POWERPC) + +#include "Common/Serialize/Serializer.h" +#include "Core/Core.h" +#include "Core/CoreTiming.h" +#include "Core/MemMap.h" +#include "Core/MIPS/MIPS.h" +#include "Core/MIPS/MIPSCodeUtils.h" +#include "Core/MIPS/MIPSInt.h" +#include "Core/MIPS/MIPSTables.h" + +#include "Core/Reporting.h" +#include "Core/HLE/HLE.h" + +#include "PpcRegCache.h" +#include "ppcEmitter.h" +#include "PpcJit.h" + +//#include + + +#define _RS MIPS_GET_RS(op) +#define _RT MIPS_GET_RT(op) +#define _RD MIPS_GET_RD(op) +#define _FS MIPS_GET_FS(op) +#define _FT MIPS_GET_FT(op) +#define _FD MIPS_GET_FD(op) +#define _SA MIPS_GET_SA(op) +#define _POS ((op>> 6) & 0x1F) +#define _SIZE ((op>>11) & 0x1F) +#define _IMM16 (signed short)(op & 0xFFFF) +#define _IMM26 (op & 0x03FFFFFF) + +#define LOOPOPTIMIZATION 0 + +// We can disable nice delay slots. +#define CONDITIONAL_NICE_DELAYSLOT delaySlotIsNice = false; +// #define CONDITIONAL_NICE_DELAYSLOT ; + +#define SHOW_JS_COMPILER_PC { printf("js.compilerPC: %08x\n", js.compilerPC); } + +#define BRANCH_COMPILE_LOG { printf("JIT(%8x): %s => %d - %08x\n", (u32)GetCodePtr() ,__FUNCTION__, cc, js.compilerPC); } + +using namespace MIPSAnalyst; + +using namespace PpcGen; +using namespace PpcJitConstants; + +namespace MIPSComp +{ + +void Jit::BranchRSRTComp(MIPSOpcode op, PpcGen::FixupBranchType cc, bool likely) +{ + if (js.inDelaySlot) { + ERROR_LOG_REPORT(JIT, "Branch in RSRTComp delay slot at %08x in block starting at %08x", js.compilerPC, js.blockStart); + return; + } + int offset = (signed short)(op&0xFFFF)<<2; + MIPSGPReg rt = _RT; + MIPSGPReg rs = _RS; + u32 targetAddr = js.compilerPC + offset + 4; + + MIPSOpcode delaySlotOp = Memory::Read_Instruction(js.compilerPC+4); + bool delaySlotIsNice = IsDelaySlotNiceReg(op, delaySlotOp, rt, rs); + CONDITIONAL_NICE_DELAYSLOT; + if (!likely && delaySlotIsNice) + CompileDelaySlot(DELAYSLOT_NICE); + + 
+ if (gpr.IsImm(rt) && gpr.GetImm(rt) == 0) + { + gpr.MapReg(rs); + CMPLI(gpr.R(rs), 0); + } + else if (gpr.IsImm(rs) && gpr.GetImm(rs) == 0) // only these are easily 'flippable' + { + gpr.MapReg(rt); + CMPLI(gpr.R(rt), 0); + } + else + { + gpr.MapInIn(rs, rt); + CMPL(gpr.R(rs), gpr.R(rt)); + } + + PpcGen::FixupBranch ptr; + if (!likely) + { + if (!delaySlotIsNice) + CompileDelaySlot(DELAYSLOT_SAFE_FLUSH); + else + FlushAll(); + ptr = B_Cond(cc); + } + else + { + FlushAll(); + ptr = B_Cond(cc); + CompileDelaySlot(DELAYSLOT_FLUSH); + } + + // Take the branch + WriteExit(targetAddr, 0); + + SetJumpTarget(ptr); + + // Not taken + WriteExit(js.compilerPC+8, 1); + + js.compiling = false; +} + + +void Jit::BranchRSZeroComp(MIPSOpcode op, PpcGen::FixupBranchType cc, bool andLink, bool likely) +{ + if (js.inDelaySlot) { + ERROR_LOG_REPORT(JIT, "Branch in RSZeroComp delay slot at %08x in block starting at %08x", js.compilerPC, js.blockStart); + return; + } + int offset = (signed short)(op&0xFFFF)<<2; + MIPSGPReg rs = _RS; + u32 targetAddr = js.compilerPC + offset + 4; + + MIPSOpcode delaySlotOp = Memory::Read_Instruction(js.compilerPC + 4); + bool delaySlotIsNice = IsDelaySlotNiceReg(op, delaySlotOp, rs); + CONDITIONAL_NICE_DELAYSLOT; + if (!likely && delaySlotIsNice) + CompileDelaySlot(DELAYSLOT_NICE); + + gpr.MapReg(rs); + CMPI(gpr.R(rs), 0); + + PpcGen::FixupBranch ptr; + if (!likely) + { + if (!delaySlotIsNice) + CompileDelaySlot(DELAYSLOT_SAFE_FLUSH); + else + FlushAll(); + ptr = B_Cond(cc); + } + else + { + FlushAll(); + ptr = B_Cond(cc); + CompileDelaySlot(DELAYSLOT_FLUSH); + } + + // Take the branch + if (andLink) + { + MOVI2R(SREG, js.compilerPC + 8); + STW(SREG, CTXREG, MIPS_REG_RA * 4); + } + + WriteExit(targetAddr, 0); + + SetJumpTarget(ptr); + + // Not taken + WriteExit(js.compilerPC + 8, 1); + + js.compiling = false; +} + +void Jit::Comp_RelBranch(MIPSOpcode op) { + // The CC flags here should be opposite of the actual branch becuase they skip the branching 
action. + switch (op>>26) + { + case 4: BranchRSRTComp(op, _BNE, false); break;//beq + case 5: BranchRSRTComp(op, _BEQ, false); break;//bne + + case 6: BranchRSZeroComp(op, _BGT, false, false); break;//blez + case 7: BranchRSZeroComp(op, _BLE, false, false); break;//bgtz + + case 20: BranchRSRTComp(op, _BNE, true); break;//beql + case 21: BranchRSRTComp(op, _BEQ, true); break;//bnel + + case 22: BranchRSZeroComp(op, _BGT, false, true); break;//blezl + case 23: BranchRSZeroComp(op, _BLE, false, true); break;//bgtzl + + default: + _dbg_assert_msg_(0,"Trying to compile instruction that can't be compiled"); + break; + } + js.compiling = false; +} + +void Jit::Comp_RelBranchRI(MIPSOpcode op) { + switch ((op >> 16) & 0x1F) + { + case 0: BranchRSZeroComp(op, _BGE, false, false); break; //if ((s32)R(rs) < 0) DelayBranchTo(addr); else PC += 4; break;//bltz + case 1: BranchRSZeroComp(op, _BLT, false, false); break; //if ((s32)R(rs) >= 0) DelayBranchTo(addr); else PC += 4; break;//bgez + case 2: BranchRSZeroComp(op, _BGE, false, true); break; //if ((s32)R(rs) < 0) DelayBranchTo(addr); else PC += 8; break;//bltzl + case 3: BranchRSZeroComp(op, _BLT, false, true); break; //if ((s32)R(rs) >= 0) DelayBranchTo(addr); else PC += 8; break;//bgezl + case 16: BranchRSZeroComp(op, _BGE, true, false); break; //R(MIPS_REG_RA) = PC + 8; if ((s32)R(rs) < 0) DelayBranchTo(addr); else PC += 4; break;//bltzal + case 17: BranchRSZeroComp(op, _BLT, true, false); break; //R(MIPS_REG_RA) = PC + 8; if ((s32)R(rs) >= 0) DelayBranchTo(addr); else PC += 4; break;//bgezal + case 18: BranchRSZeroComp(op, _BGE, true, true); break; //R(MIPS_REG_RA) = PC + 8; if ((s32)R(rs) < 0) DelayBranchTo(addr); else SkipLikely(); break;//bltzall + case 19: BranchRSZeroComp(op, _BLT, true, true); break; //R(MIPS_REG_RA) = PC + 8; if ((s32)R(rs) >= 0) DelayBranchTo(addr); else SkipLikely(); break;//bgezall + default: + _dbg_assert_msg_(0,"Trying to compile instruction that can't be compiled"); + break; + } + 
js.compiling = false; +} + + +// If likely is set, discard the branch slot if NOT taken. +void Jit::BranchFPFlag(MIPSOpcode op, PpcGen::FixupBranchType cc, bool likely) +{ + if (js.inDelaySlot) { + ERROR_LOG_REPORT(JIT, "Branch in FPFlag delay slot at %08x in block starting at %08x", js.compilerPC, js.blockStart); + return; + } + int offset = (signed short)(op & 0xFFFF) << 2; + u32 targetAddr = js.compilerPC + offset + 4; + + MIPSOpcode delaySlotOp = Memory::Read_Instruction(js.compilerPC + 4); + bool delaySlotIsNice = IsDelaySlotNiceFPU(op, delaySlotOp); + CONDITIONAL_NICE_DELAYSLOT; + if (!likely && delaySlotIsNice) + CompileDelaySlot(DELAYSLOT_NICE); + + FlushAll(); + + LWZ(SREG, CTXREG, offsetof(MIPSState, fpcond)); + // change CR0 + ANDI(SREG, SREG, 1); + + PpcGen::FixupBranch ptr; + if (!likely) + { + if (!delaySlotIsNice) + CompileDelaySlot(DELAYSLOT_SAFE_FLUSH); + ptr = B_Cond(cc); + } + else + { + ptr = B_Cond(cc); + CompileDelaySlot(DELAYSLOT_FLUSH); + } + + // Take the branch + WriteExit(targetAddr, 0); + + SetJumpTarget(ptr); + // Not taken + WriteExit(js.compilerPC + 8, 1); + js.compiling = false; +} + +void Jit::Comp_FPUBranch(MIPSOpcode op) { + switch((op >> 16) & 0x1f) + { + case 0: BranchFPFlag(op, _BNE, false); break; // bc1f + case 1: BranchFPFlag(op, _BEQ, false); break; // bc1t + case 2: BranchFPFlag(op, _BNE, true); break; // bc1fl + case 3: BranchFPFlag(op, _BEQ, true); break; // bc1tl + default: + _dbg_assert_msg_(0,"Trying to interpret instruction that can't be interpreted"); + break; + } + js.compiling = false; +} + + +// If likely is set, discard the branch slot if NOT taken. 
+void Jit::BranchVFPUFlag(MIPSOpcode op, PpcGen::FixupBranchType cc, bool likely) +{ + if (js.inDelaySlot) { + ERROR_LOG_REPORT(JIT, "Branch in VFPU delay slot at %08x in block starting at %08x", js.compilerPC, js.blockStart); + return; + } + int offset = (signed short)(op & 0xFFFF) << 2; + u32 targetAddr = js.compilerPC + offset + 4; + + MIPSOpcode delaySlotOp = Memory::Read_Instruction(js.compilerPC + 4); + + bool delaySlotIsNice = IsDelaySlotNiceVFPU(op, delaySlotOp); + CONDITIONAL_NICE_DELAYSLOT; + if (!likely && delaySlotIsNice) + CompileDelaySlot(DELAYSLOT_NICE); + + FlushAll(); + + int imm3 = (op >> 18) & 7; + + + MOVI2R(SREG, (u32)&(mips_->vfpuCtrl[VFPU_CTRL_CC])); + LWZ(SREG, SREG, 0); + // change CR0 + ANDI(SREG, SREG, 1 << imm3); + + PpcGen::FixupBranch ptr; + js.inDelaySlot = true; + if (!likely) + { + if (!delaySlotIsNice) + CompileDelaySlot(DELAYSLOT_SAFE_FLUSH); + ptr = B_Cond(cc); + } + else + { + ptr = B_Cond(cc); + CompileDelaySlot(DELAYSLOT_FLUSH); + } + js.inDelaySlot = false; + + // Take the branch + WriteExit(targetAddr, 0); + + SetJumpTarget(ptr); + + // Not taken + WriteExit(js.compilerPC + 8, 1); + js.compiling = false; +} + +void Jit::Comp_VBranch(MIPSOpcode op) { + switch ((op >> 16) & 3) + { + case 0: BranchVFPUFlag(op, _BNE, false); break; // bvf + case 1: BranchVFPUFlag(op, _BEQ, false); break; // bvt + case 2: BranchVFPUFlag(op, _BNE, true); break; // bvfl + case 3: BranchVFPUFlag(op, _BEQ, true); break; // bvtl + } + js.compiling = false; +} + +void Jit::Comp_Jump(MIPSOpcode op) { + if (js.inDelaySlot) { + ERROR_LOG_REPORT(JIT, "Branch in Jump delay slot at %08x in block starting at %08x", js.compilerPC, js.blockStart); + return; + } + u32 off = ((op & 0x03FFFFFF) << 2); + u32 targetAddr = (js.compilerPC & 0xF0000000) | off; + + switch (op >> 26) + { + case 2: //j + CompileDelaySlot(DELAYSLOT_NICE); + FlushAll(); + WriteExit(targetAddr, 0); + break; + + case 3: //jal + gpr.MapReg(MIPS_REG_RA, MAP_NOINIT | MAP_DIRTY); + 
MOVI2R(gpr.R(MIPS_REG_RA), js.compilerPC + 8); // link: RA = address of the instruction after the delay slot + CompileDelaySlot(DELAYSLOT_NICE); + FlushAll(); + WriteExit(targetAddr, 0); + break; + + default: + _dbg_assert_msg_(0,"Trying to compile instruction that can't be compiled"); + break; + } + js.compiling = false; +} + +void Jit::Comp_JumpReg(MIPSOpcode op) { + if (js.inDelaySlot) { + ERROR_LOG_REPORT(JIT, "Branch in JumpReg delay slot at %08x in block starting at %08x", js.compilerPC, js.blockStart); + return; + } + MIPSGPReg rs = _RS; + MIPSGPReg rd = _RD; + + MIPSOpcode delaySlotOp = Memory::Read_Instruction(js.compilerPC + 4); + bool delaySlotIsNice = IsDelaySlotNiceReg(op, delaySlotOp, rs); + CONDITIONAL_NICE_DELAYSLOT; + + if (IsSyscall(delaySlotOp)) { + gpr.MapReg(rs); + PPCReg mRs = gpr.R(rs); + MR(FLAGREG, mRs); + MovToPC(FLAGREG); // For syscall to be able to return. + CompileDelaySlot(DELAYSLOT_FLUSH); + return; // Syscall wrote exit code. + } else if (delaySlotIsNice) { + CompileDelaySlot(DELAYSLOT_NICE); + gpr.MapReg(rs); + PPCReg mRs = gpr.R(rs); + MR(FLAGREG, mRs); // Delay slot was already compiled above; copy the destination into FLAGREG for WriteExitDestInR below. + FlushAll(); + } else { + // Delay slot is not "nice" with respect to rs: copy the destination out before compiling it + gpr.MapReg(rs); + PPCReg mRs = gpr.R(rs); + MR(FLAGREG, mRs); // Save the destination address through the delay slot. Could use isNice to avoid when the jit is fully implemented + CompileDelaySlot(DELAYSLOT_NICE); + FlushAll(); + } + + switch (op & 0x3f) + { + case 8: //jr + break; + case 9: //jalr + // mips->r[rd] = js.compilerPC + 8, stored straight into the context at offset rd * 4 (after FlushAll) + MOVI2R(SREG, js.compilerPC + 8); + STW(SREG, CTXREG, (int)rd * 4); + break; + default: + _dbg_assert_msg_(0,"Trying to compile instruction that can't be compiled"); + break; + } + + WriteExitDestInR(FLAGREG); + js.compiling = false; +} + +void Jit::Comp_Syscall(MIPSOpcode op) { + FlushAll(); + + // If we're in a delay slot, this is off by one. + const int offset = js.inDelaySlot ? 
-1 : 0; + WriteDownCount(offset); + js.downcountAmount = -offset; + + // CallSyscall(op); + MOVI2R(R3, op.encoding); + SaveDowncount(DCNTREG); +#ifdef __wiiu__ + // R3 is expected to contain the address of the 4-byte MIPSOpcode struct, so spill op.encoding to a temporary stack slot and pass its address + ADDI(R1, R1, -8); + STWU(R3, R1, 0); + MR(R3, R1); +#endif + QuickCallFunction((void *)&CallSyscall); +#ifdef __wiiu__ + ADDI(R1, R1, 8); // release the temporary stack slot reserved before the call +#endif + + RestoreDowncount(DCNTREG); + + WriteSyscallExit(); + js.compiling = false; +} + +void Jit::Comp_Break(MIPSOpcode op) { + Comp_Generic(op); + WriteSyscallExit(); + js.compiling = false; +} + + +} +#endif diff --git a/Core/MIPS/PPC/PpcCompFpu.cpp b/Core/MIPS/PPC/PpcCompFpu.cpp new file mode 100644 index 000000000000..64b6e82cd67c --- /dev/null +++ b/Core/MIPS/PPC/PpcCompFpu.cpp @@ -0,0 +1,470 @@ +#include "ppsspp_config.h" +#if PPSSPP_ARCH(POWERPC) + +#include "Common/Serialize/Serializer.h" +#include "Core/Config.h" +#include "Core/Core.h" +#include "Core/CoreTiming.h" +#include "Core/MemMap.h" +#include "Core/MIPS/MIPS.h" +#include "Core/MIPS/MIPSCodeUtils.h" +#include "Core/MIPS/MIPSInt.h" +#include "Core/MIPS/MIPSTables.h" + +#include "PpcRegCache.h" +#include "ppcEmitter.h" +#include "PpcJit.h" + +#define _RS MIPS_GET_RS(op) +#define _RT MIPS_GET_RT(op) +#define _RD MIPS_GET_RD(op) +#define _FS MIPS_GET_FS(op) +#define _FT MIPS_GET_FT(op) +#define _FD MIPS_GET_FD(op) +#define _SA MIPS_GET_SA(op) +#define _POS ((op>> 6) & 0x1F) +#define _SIZE ((op>>11) & 0x1F) +#define _IMM16 (signed short)(op & 0xFFFF) +#define _IMM26 (op & 0x03FFFFFF) + +// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly. +// Currently known non working ones should have DISABLE. 
+ +//#define CONDITIONAL_DISABLE { Comp_Generic(op); return; } +#define CONDITIONAL_DISABLE ; +#define DISABLE { Comp_Generic(op); return; } + +using namespace PpcGen; +using namespace PpcJitConstants; + +namespace MIPSComp +{ + +void Jit::Comp_FPU3op(MIPSOpcode op) { + CONDITIONAL_DISABLE; + + int ft = _FT; + int fs = _FS; + int fd = _FD; + + fpr.MapDirtyInIn(fd, fs, ft); + switch (op & 0x3f) + { + case 0: FADDS(fpr.R(fd), fpr.R(fs), fpr.R(ft)); break; //F(fd) = F(fs) + F(ft); //add + case 1: FSUBS(fpr.R(fd), fpr.R(fs), fpr.R(ft)); break; //F(fd) = F(fs) - F(ft); //sub + case 2: { //F(fd) = F(fs) * F(ft); //mul + FMULS(fpr.R(fd), fpr.R(fs), fpr.R(ft)); + break; + } + case 3: FDIVS(fpr.R(fd), fpr.R(fs), fpr.R(ft)); break; //F(fd) = F(fs) / F(ft); //div + default: + DISABLE; + return; + } +} + +void Jit::Comp_FPULS(MIPSOpcode op) { + CONDITIONAL_DISABLE; + + s32 offset = (s16)(op & 0xFFFF); + int ft = _FT; + int rs = _RS; + // u32 addr = R(rs) + offset; + // logBlocks = 1; + bool doCheck = false; // never read in this function + + if (!g_Config.bFastMemory) { + DISABLE; // only the unchecked fast-memory path is emitted below + } + + switch(op >> 26) + { + case 49: //FI(ft) = Memory::Read_U32(addr); break; //lwc1 + fpr.SpillLock(ft); + fpr.MapReg(ft, MAP_NOINIT | MAP_DIRTY); + if (gpr.IsImm(rs)) { + u32 addr = (offset + gpr.GetImm(rs)) & 0x3FFFFFFF; + MOVI2R(SREG, addr); + } else { + gpr.MapReg(rs); + SetRegToEffectiveAddress(SREG, rs, offset); + } + + LoadFloatSwap(fpr.R(ft), BASEREG, SREG); + + fpr.ReleaseSpillLocksAndDiscardTemps(); + break; + + case 57: //Memory::Write_U32(FI(ft), addr); break; //swc1 + fpr.MapReg(ft); + if (gpr.IsImm(rs)) { + u32 addr = (offset + gpr.GetImm(rs)) & 0x3FFFFFFF; + MOVI2R(SREG, addr); + } else { + gpr.MapReg(rs); + SetRegToEffectiveAddress(SREG, rs, offset); + } + + SaveFloatSwap(fpr.R(ft), BASEREG, SREG); + break; + + default: + Comp_Generic(op); + return; + } +} + +#if 0 +/** +This could be done with a branch, but I'm trying to do it branch-free; it is not working correctly yet ... 
+**/ +void Jit::Comp_FPUComp(MIPSOpcode op) { + DISABLE; // falls back to Comp_Generic and returns; the rest is unreachable (and this variant is compiled out by #if 0) + CONDITIONAL_DISABLE; + + + int opc = op & 0xF; + if (opc >= 8) opc -= 8; // alias + if (opc == 0) { // f, sf (signalling false) + MOVI2R(SREG, 0); + STW(SREG, CTXREG, offsetof(MIPSState, fpcond)); + return; + } + + int fs = _FS; + int ft = _FT; + fpr.MapInIn(fs, ft); + + PPCReg _tmp = FPR8; + PPCReg _zero = FPR6; + PPCReg _one = FPR7; + + //VCMP(fpr.R(fs), fpr.R(ft)); + //VMRS_APSR(); // Move FP flags from FPSCR to APSR (regular flags). + + /** + Condition-Register Field and Floating-Point Condition Code Interpretation + Bit Name Description + 1000 FL (FRA) < (FRB) + 0100 FG (FRA) > (FRB) + 0010 FE (FRA) = (FRB) + 0001 FU (FRA) ? (FRB) (unordered) + **/ + + switch(opc) + { + // OK + case 1: // un, ngle (unordered) + FCMPU(0, fpr.R(fs), fpr.R(ft)); + MFCR(SREG); + SRAWI(SREG, SREG, 28); + ANDI(SREG, SREG, 0x1); + break; + // FAIL (falls back to Comp_Generic via DISABLE) + case 2: //eq, seq (equal, ordered) + DISABLE; + //Break(); + FCMPO(0, fpr.R(fs), fpr.R(ft)); + MFCR(SREG); + SRAWI(SREG, SREG, 28); + SRAWI(SREG, SREG, 2); + ANDI(SREG, SREG, 0x1); + break; + // FAIL (falls back to Comp_Generic via DISABLE) + case 3: // ueq, ngl (equal, unordered) + DISABLE; + //Break(); + FCMPU(0, fpr.R(fs), fpr.R(ft)); + MFCR(R7); + SRAWI(R7, R7, 28); + + SRAWI(SREG, R7, 2); + ANDI(SREG, SREG, 0x1); + + // check unordered + ANDI(R7, R7, 0x1); + // SREG = ((R7 >> 2) & 1) || ((R8 >> 3) & 1) + OR(SREG, R7, SREG); + return; + // OK + case 4: // olt, lt (less than, ordered) + //DISABLE; + //Break(); + FCMPO(0, fpr.R(fs), fpr.R(ft)); + MFCR(SREG); + + SRAWI(SREG, SREG, 28); + SRAWI(SREG, SREG, 3); + + // SREG = SREG & 1 + ANDI(SREG, SREG, 0x1); + break; + // OK + case 5: // ult, nge (less than, unordered) + //DISABLE; + //Break(); + FCMPO(0, fpr.R(fs), fpr.R(ft)); + MFCR(R7); + SRAWI(R7, R7, 28); + + // SREG = SREG & 1 + SRAWI(SREG, R7, 3); + ANDI(SREG, SREG, 0x1); + + // check unordered + ANDI(R7, R7, 0x1); + + // final + OR(SREG, R7, SREG); + break; + // FAIL + case 6: // ole, le (less equal, ordered) + 
DISABLE; + //Break(); + FCMPO(0, fpr.R(ft), fpr.R(fs)); + MFCR(SREG); + SRAWI(SREG, SREG, 28); + + // SREG = (SREG >> 1) & 1 + SRAWI(SREG, SREG, 3); + ANDI(SREG, SREG, 0x1); + break; + // FAIL + case 7: // ule, ngt (less equal, unordered) + DISABLE; + //Break(); + FCMPO(0, fpr.R(ft), fpr.R(fs)); + MFCR(R7); + SRAWI(R7, R7, 28); + + // SREG = (SREG >> 1) & 1 + SRAWI(SREG, R7, 1); + ANDI(SREG, SREG, 0x1); + // check unordered + // R8 = (R7 >> 3) & 1 + SRAWI(R8, R7, 3); + ANDI(R8, R8, 0x1); + // SREG = (R7 & 1) || ((R8 >> 3) & 1) + OR(SREG, R7, R8); + break; + default: + Comp_Generic(op); + return; + } + STW(SREG, CTXREG, offsetof(MIPSState, fpcond)); +} +#else +/** +* 2nd attempt: leaves SREG = 1 when B_Cond(cond) is taken after the compare, 0 otherwise +**/ +void Jit::FPUComp(int fs, int ft, PpcGen::FixupBranchType cond, bool unorderer, int bf) { + PpcGen::FixupBranch ptr; + + // Default result + MOVI2R(SREG, 1); + + // Compare + FCMPU(0, fpr.R(fs), fpr.R(ft)); + + if (unorderer) { + // 3 = UN: fold the unordered bit into condition bit bf before branching + CROR(bf, bf, 3); + } + + // Condition holds: skip the store of 0 below and keep SREG == 1 + ptr = B_Cond(cond); + + MOVI2R(SREG, 0); + + SetJumpTarget(ptr); +} + +/** https://github.com/gligli/mupen64-360/blob/42bf04f370f00f16be17f3ba9f74b420a7d86422/source/r4300/ppc/MIPS-to-PPC.c#L2916 **/ +/** +Condition-Register Field and Floating-Point Condition Code Interpretation +Bit Name Description +0 FL (FRA) < (FRB) +1 FG (FRA) > (FRB) +2 FE (FRA) = (FRB) +3 FU (FRA) ? 
(FRB) (unordered) +**/ +void Jit::Comp_FPUComp(MIPSOpcode op) { + CONDITIONAL_DISABLE; + + int opc = op & 0xF; + if (opc >= 8) opc -= 8; // alias + if (opc == 0) { // f, sf (signalling false) + MOVI2R(SREG, 0); + STW(SREG, CTXREG, offsetof(MIPSState, fpcond)); + return; + } + + int fs = _FS; + int ft = _FT; + fpr.MapInIn(fs, ft); + + switch(opc) + { + // FAIL + case 1: // un, ngle (unordered) + DISABLE; + break; + // OK + case 2: //eq, seq (equal, ordered) + FPUComp(fs, ft, _BEQ); + break; + // FAIL + case 3: // ueq, ngl (equal, unordered) + FPUComp(fs, ft, _BEQ, true, 2); + break; + // OK + case 4: // olt, lt (less than, ordered) + FPUComp(fs, ft, _BLT); + break; + // FAIL + case 5: // ult, nge (less than, unordered) + FPUComp(fs, ft, _BLT, true, 0); + break; + // OK + case 6: // ole, le (less equal, ordered) + FPUComp(fs, ft, _BLE, true, 1); // NOTE(review): ORs UN into bit 1 (FG) - presumably so that _BLE (not-greater) rejects unordered operands; confirm against the emitter + break; + // FAIL + case 7: // ule, ngt (less equal, unordered) + FPUComp(fs, ft, _BLE); // NOTE(review): presumably relies on _BLE meaning not-greater, which also passes unordered operands; confirm + break; + default: + Comp_Generic(op); + return; + } + STW(SREG, CTXREG, offsetof(MIPSState, fpcond)); +} + +#endif + +void Jit::Comp_FPU2op(MIPSOpcode op) { + DISABLE // always falls back to Comp_Generic; everything below is currently unreachable + CONDITIONAL_DISABLE; + + int fs = _FS; + int fd = _FD; + + switch (op & 0x3f) + { +#if !PPSSPP_ARCH(PPC750) + case 4: //F(fd) = sqrtf(F(fs)); break; //sqrt + fpr.MapDirtyIn(fd, fs); + FSQRTS(fpr.R(fd), fpr.R(fs)); + break; +#endif + case 5: //F(fd) = fabsf(F(fs)); break; //abs + fpr.MapDirtyIn(fd, fs); + FABS(fpr.R(fd), fpr.R(fs)); + break; + case 6: //F(fd) = F(fs); break; //mov + fpr.MapDirtyIn(fd, fs); + FMR(fpr.R(fd), fpr.R(fs)); + break; + case 7: //F(fd) = -F(fs); break; //neg + fpr.MapDirtyIn(fd, fs); + FNEG(fpr.R(fd), fpr.R(fs)); + break; + + case 13: // FsI(fd) = F(fs)>=0 ? 
(int)floorf(F(fs)) : (int)ceilf(F(fs)); break; //trunc.w.s + fpr.MapDirtyIn(fd, fs); + FRIZ(fpr.R(fd), fpr.R(fs)); // NOTE(review): presumably rounds to an integral float, whereas trunc.w.s wants an integer bit pattern (FsI) - confirm before enabling this function + break; + /* + case 12: // FsI(fd) = (int)floorf(F(fs)+0.5f); break; //round.w.s + case 14: // FsI(fd) = (int)ceilf (F(fs)); break; //ceil.w.s + case 15: // FsI(fd) = (int)floorf(F(fs)); break; //floor.w.s + case 32: // F(fd) = (float)FsI(fs); break; //cvt.s.w + + case 36: + //switch (currentMIPS->fcr31 & 3) + //{ + //case 0: FsI(fd) = (int)round_ieee_754(F(fs)); break; // RINT_0 + //case 1: FsI(fd) = (int)F(fs); break; // CAST_1 + //case 2: FsI(fd) = (int)ceilf(F(fs)); break; // CEIL_2 + //case 3: FsI(fd) = (int)floorf(F(fs)); break; // FLOOR_3 + //} + //break; //cvt.w.s + */ + default: + Comp_Generic(op); + break; + } +} + +/** +Seem to work +**/ +void Jit::Comp_mxc1(MIPSOpcode op) { + CONDITIONAL_DISABLE; + + int fs = _FS; + MIPSGPReg rt = _RT; + + switch ((op >> 21) & 0x1f) + { + case 0: // R(rt) = FI(fs); break; //mfc1 + // Let's just go through RAM for now. + fpr.FlushR(fs); + gpr.MapReg(rt, MAP_DIRTY | MAP_NOINIT); + LWZ(gpr.R(rt), CTXREG, fpr.GetMipsRegOffset(fs)); + return; + + case 2: //cfc1 + if (fs == 31) + { + /* Todo Lazy code ! */ + gpr.MapReg(rt, MAP_DIRTY | MAP_NOINIT); + PPCReg _rt = gpr.R(rt); + + // SREG = fpcond & 1; + LWZ(SREG, CTXREG, offsetof(MIPSState, fpcond)); + ANDI(SREG, SREG, 1); // Just in case + // SREG << 23 + SLWI(SREG, SREG, 23); + + // RT = fcr31 & ~(1<<23) (the 9..7 mask wraps around, leaving out only that bit) + LWZ(_rt, CTXREG, offsetof(MIPSState, fcr31)); + RLWINM(_rt, _rt, 0, 9, 7); + + // RT = RT | SREG + OR(_rt, _rt, SREG); + } else if (fs == 0) { + gpr.SetImm(rt, MIPSState::FCR0_VALUE); + } else { + // Unsupported regs are always 0. + gpr.SetImm(rt, 0); + } + return; + + case 4: //FI(fs) = R(rt); break; //mtc1 + // Let's just go through RAM for now. 
+ gpr.FlushR(rt); + fpr.MapReg(fs, MAP_DIRTY | MAP_NOINIT); + LFS(fpr.R(fs), CTXREG, gpr.GetMipsRegOffset(rt)); + return; + + case 6: //ctc1 (only fs == 31 / fcr31 is handled; writes to other control registers emit nothing) + if (fs == 31) + { + gpr.MapReg(rt, 0); + + // Update MIPS state + // fcr31 = rt + STW(gpr.R(rt), CTXREG, offsetof(MIPSState, fcr31)); + + // fpcond = (rt >> 23) & 1; + SRWI(SREG, gpr.R(rt), 23); + ANDI(SREG, SREG, 1); + STW(SREG, CTXREG, offsetof(MIPSState, fpcond)); + } + return; + } +} + +} +#endif diff --git a/Core/MIPS/PPC/PpcCompLoadStore.cpp b/Core/MIPS/PPC/PpcCompLoadStore.cpp new file mode 100644 index 000000000000..68984c4e59c8 --- /dev/null +++ b/Core/MIPS/PPC/PpcCompLoadStore.cpp @@ -0,0 +1,158 @@ +#include "ppsspp_config.h" +#if PPSSPP_ARCH(POWERPC) + +#include "Common/Serialize/Serializer.h" +#include "Core/Config.h" +#include "Core/Core.h" +#include "Core/CoreTiming.h" +#include "Core/MemMap.h" +#include "Core/MIPS/MIPS.h" +#include "Core/MIPS/MIPSCodeUtils.h" +#include "Core/MIPS/MIPSInt.h" +#include "Core/MIPS/MIPSTables.h" + +#include "PpcRegCache.h" +#include "ppcEmitter.h" +#include "PpcJit.h" + + +#define _RS ((op>>21) & 0x1F) +#define _RT ((op>>16) & 0x1F) +#define _RD ((op>>11) & 0x1F) +#define _FS ((op>>11) & 0x1F) +#define _FT ((op>>16) & 0x1F) +#define _FD ((op>>6 ) & 0x1F) +#define _POS ((op>>6 ) & 0x1F) +#define _SIZE ((op>>11) & 0x1F) + +// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly. +// Currently known non working ones should have DISABLE. 
+ +//#define CONDITIONAL_DISABLE { Comp_Generic(op); return; } +#define CONDITIONAL_DISABLE ; +#define DISABLE { Comp_Generic(op); return; } + +using namespace PpcGen; +using namespace PpcJitConstants; + +namespace MIPSComp +{ +// Emits r = (gpr.R(rs) + offset) & 0x3FFFFFFF. rs must already be mapped by the caller. The result goes to the requested register r (previously SREG was hard-coded and r was ignored). +void Jit::SetRegToEffectiveAddress(PpcGen::PPCReg r, int rs, s16 offset) { + if (offset) { + ADDI(r, gpr.R(rs), offset); + RLWINM(r, r, 0, 2, 31); // &= 0x3FFFFFFF + } else { + RLWINM(r, gpr.R(rs), 0, 2, 31); // &= 0x3FFFFFFF + } + +} +void Jit::Comp_ITypeMem(MIPSOpcode op) { + CONDITIONAL_DISABLE; + + int offset = (signed short)(op&0xFFFF); + bool load = false; + int rt = _RT; + int rs = _RS; + int o = op>>26; + if (((op >> 29) & 1) == 0 && rt == 0) { + // Don't load anything into $zr + return; + } + + if (!g_Config.bFastMemory) { + DISABLE; + } + + u32 iaddr = gpr.IsImm(rs) ? offset + gpr.GetImm(rs) : 0xFFFFFFFF; + bool doCheck = false; + + switch (o) + { + case 32: //lb + case 33: //lh + case 35: //lw + case 36: //lbu + case 37: //lhu + load = true; // deliberate fall-through into the shared load/store path + case 40: //sb + case 41: //sh + case 43: //sw + + if (gpr.IsImm(rs) && Memory::IsValidAddress(iaddr)) { + // We can compute the full address at compile time. Kickass. + u32 addr = iaddr & 0x3FFFFFFF; + // Must be OK even if rs == rt since we have the value from imm already. + gpr.MapReg(rt, load ? MAP_NOINIT | MAP_DIRTY : 0); + MOVI2R(SREG, addr); + } else { + _dbg_assert_msg_(!gpr.IsImm(rs), "Invalid immediate address? CPU bug?"); + load ? 
gpr.MapDirtyIn(rt, rs) : gpr.MapInIn(rt, rs); + + SetRegToEffectiveAddress(SREG, rs, offset); + } + switch (o) + { + // Load + case 32: //lb + LBZX(gpr.R(rt), BASEREG, SREG); + EXTSB(gpr.R(rt), gpr.R(rt)); + break; + case 33: //lh + LHBRX(gpr.R(rt), BASEREG, SREG); + EXTSH(gpr.R(rt), gpr.R(rt)); + break; + case 35: //lw + LWBRX(gpr.R(rt), BASEREG, SREG); + break; + case 36: //lbu + LBZX (gpr.R(rt), BASEREG, SREG); + break; + case 37: //lhu + LHBRX (gpr.R(rt), BASEREG, SREG); + break; + // Store + case 40: //sb + STBX (gpr.R(rt), BASEREG, SREG); + break; + case 41: //sh + STHBRX(gpr.R(rt), BASEREG, SREG); + break; + case 43: //sw + STWBRX(gpr.R(rt), BASEREG, SREG); + break; + } + break; + case 34: //lwl + case 38: //lwr + load = true; // deliberate fall-through into the shared lwl/lwr/swl/swr handling + case 42: //swl + case 46: //swr + if (!js.inDelaySlot) { + // Optimisation: Combine to single unaligned load/store + bool isLeft = (o == 34 || o == 42); + MIPSOpcode nextOp = Memory::Read_Instruction(js.compilerPC + 4); + // Find a matching shift in opposite direction with opposite offset. + if (nextOp == (isLeft ? (op.encoding + (4<<26) - 3) + : (op.encoding - (4<<26) + 3))) + { + EatInstruction(nextOp); + nextOp = MIPSOpcode(((load ? 35 : 43) << 26) | ((isLeft ? nextOp : op) & 0x03FFFFFF)); //lw, sw + Comp_ITypeMem(nextOp); + return; + } + } + + DISABLE; // Disabled until crashes are resolved. + break; + default: + Comp_Generic(op); + return ; + } + } + + void Jit::Comp_Cache(MIPSOpcode op) { + DISABLE; + } +} +#endif diff --git a/Core/MIPS/PPC/PpcCompReplace.cpp b/Core/MIPS/PPC/PpcCompReplace.cpp new file mode 100644 index 000000000000..26f8e756e354 --- /dev/null +++ b/Core/MIPS/PPC/PpcCompReplace.cpp @@ -0,0 +1,45 @@ +// Copyright (c) 2012- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. 
+ +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. + +#include "ppsspp_config.h" +#if PPSSPP_ARCH(POWERPC) + +#include // NOTE(review): the header name is missing on this line (possibly lost in extraction) - restore it before applying this patch +#include "math/math_util.h" + +#include "Core/MemMap.h" +#include "Core/MIPS/MIPS.h" +#include "Core/MIPS/MIPSAnalyst.h" +#include "Core/MIPS/MIPSCodeUtils.h" +#include "Common/CPUDetect.h" +#include "Core/Config.h" +#include "Core/Reporting.h" +#include "Core/MIPS/JitCommon/JitCommon.h" +#include "Core/MIPS/PPC/PpcRegCache.h" +#include "Core/MIPS/PPC/PpcJit.h" + +namespace MIPSComp { + +int Jit::Replace_fabsf() { + return -1; // not implemented on PPC yet; presumably -1 tells the caller no replacement was emitted - confirm against the caller + // fpr.MapDirtyIn(0, 13); + // VABS(fpr.R(0), fpr.R(13)); + // return 6; // Number of instructions in the MIPS function +} + +} +#endif diff --git a/Core/MIPS/PPC/PpcCompVFPU.cpp b/Core/MIPS/PPC/PpcCompVFPU.cpp new file mode 100644 index 000000000000..83bdfbb3b1c4 --- /dev/null +++ b/Core/MIPS/PPC/PpcCompVFPU.cpp @@ -0,0 +1,1236 @@ +#include "ppsspp_config.h" +#if PPSSPP_ARCH(POWERPC) + +#include "math/math_util.h" + +#include "Common/Serialize/Serializer.h" +#include "Core/MemMap.h" +#include "Core/Config.h" +#include "Core/Core.h" +#include "Core/CoreTiming.h" +#include "Core/MIPS/MIPS.h" +#include "Core/MIPS/MIPSCodeUtils.h" +#include "Core/MIPS/MIPSInt.h" +#include "Core/MIPS/MIPSTables.h" +#include "Core/MIPS/MIPSDebugInterface.h" + +#include "PpcRegCache.h" +#include "ppcEmitter.h" +#include "PpcJit.h" + +const bool disablePrefixes = false; + +// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly. 
+// Currently known non working ones should have DISABLE. + +// #define CONDITIONAL_DISABLE { fpr.ReleaseSpillLocksAndDiscardTemps(); Comp_Generic(op); return; } +#define CONDITIONAL_DISABLE ; +#define DISABLE { fpr.ReleaseSpillLocksAndDiscardTemps(); Comp_Generic(op); return; } + +#define _RS MIPS_GET_RS(op) +#define _RT MIPS_GET_RT(op) +#define _RD MIPS_GET_RD(op) +#define _FS MIPS_GET_FS(op) +#define _FT MIPS_GET_FT(op) +#define _FD MIPS_GET_FD(op) +#define _SA MIPS_GET_SA(op) +#define _POS ((op>> 6) & 0x1F) +#define _SIZE ((op>>11) & 0x1F) +#define _IMM16 (signed short)(op & 0xFFFF) +#define _IMM26 (op & 0x03FFFFFF) + +using namespace PpcGen; +using namespace PpcJitConstants; + +// #define USE_VMX128 + +namespace MIPSComp +{ + // Vector regs can overlap in all sorts of swizzled ways. + // This does allow a single overlap in sregs[i]. + static bool IsOverlapSafeAllowS(int dreg, int di, int sn, u8 sregs[], int tn = 0, u8 tregs[] = NULL) + { + for (int i = 0; i < sn; ++i) + { + if (sregs[i] == dreg && i != di) + return false; + } + for (int i = 0; i < tn; ++i) + { + if (tregs[i] == dreg) + return false; + } + + // Hurray, no overlap, we can write directly. + return true; + } + // Stricter variant of IsOverlapSafeAllowS: sregs[di] == dreg also counts as an overlap. + static bool IsOverlapSafe(int dreg, int di, int sn, u8 sregs[], int tn = 0, u8 tregs[] = NULL) + { + return IsOverlapSafeAllowS(dreg, di, sn, sregs, tn, tregs) && sregs[di] != dreg; + } + + void Jit::ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz) { + if (prefix == 0xE4) return; // 0xE4 decodes below to lane i <- lane i with no abs/negate/constant bits, i.e. nothing to do + + int n = GetNumVectorElements(sz); + u8 origV[4]; + static const float constantArray[8] = {0.f, 1.f, 2.f, 0.5f, 3.f, 1.f/3.f, 0.25f, 1.f/6.f}; + + for (int i = 0; i < n; i++) + origV[i] = vregs[i]; + + for (int i = 0; i < n; i++) + { + int regnum = (prefix >> (i*2)) & 3; + int abs = (prefix >> (8+i)) & 1; + int negate = (prefix >> (16+i)) & 1; + int constants = (prefix >> (12+i)) & 1; + + // Unchanged, hurray. 
+ if (!constants && regnum == i && !abs && !negate) + continue; + + // This puts the value into a temp reg, so we won't write the modified value back. + vregs[i] = fpr.GetTempV(); + if (!constants) { + // Prefix may say "z, z, z, z" but if this is a pair, we force to x. Checked BEFORE mapping: origV[] only holds the first n lanes. + // TODO: But some ops seem to use const 0 instead? + if (regnum >= n) { + WARN_LOG(CPU, "JIT: Invalid VFPU swizzle: %08x : %d / %d at PC = %08x (%s)", prefix, regnum, n, js.compilerPC, currentDebugMIPS->disasm(js.compilerPC, 0)); + regnum = 0; + } + + fpr.MapDirtyInV(vregs[i], origV[regnum]); + fpr.SpillLockV(vregs[i]); + + if (abs) { + FABS(fpr.V(vregs[i]), fpr.V(origV[regnum])); + if (negate) + FNEG(fpr.V(vregs[i]), fpr.V(vregs[i])); + } else { + if (negate) + FNEG(fpr.V(vregs[i]), fpr.V(origV[regnum])); + else + FMR(fpr.V(vregs[i]), fpr.V(origV[regnum])); + } + } else { + fpr.MapRegV(vregs[i], MAP_DIRTY | MAP_NOINIT); + fpr.SpillLockV(vregs[i]); + MOVI2F(fpr.V(vregs[i]), constantArray[regnum + (abs<<2)], negate); + } + } + } + + void Jit::GetVectorRegsPrefixD(u8 *regs, VectorSize sz, int vectorReg) { + _assert_(js.prefixDFlag & PpcJitState::PREFIX_KNOWN); + + GetVectorRegs(regs, sz, vectorReg); + if (js.prefixD == 0) + return; + + int n = GetNumVectorElements(sz); + for (int i = 0; i < n; i++) { + // Hopefully this is rare, we'll just write it into a reg we drop. + if (js.VfpuWriteMask(i)) + regs[i] = fpr.GetTempV(); + } + } + + void Jit::ApplyPrefixD(const u8 *vregs, VectorSize sz) { + _assert_(js.prefixDFlag & PpcJitState::PREFIX_KNOWN); + if (!js.prefixD) return; + + int n = GetNumVectorElements(sz); + for (int i = 0; i < n; i++) { + if (js.VfpuWriteMask(i)) + continue; + + // TODO: These clampers are wrong - put this into google + // and look at the plot: abs(x) - abs(x-0.5) + 0.5 + // It's too steep. + + // Also, they mishandle NaN and Inf. 
+ int sat = (js.prefixD >> (i * 2)) & 3; + if (sat == 1) { // clamp lane to [0, 1] + fpr.MapRegV(vregs[i], MAP_DIRTY); + + MOVI2F(FPR6, 0.0f); + MOVI2F(FPR7, 1.0f); + + FMAX(fpr.V(vregs[i]), fpr.V(vregs[i]), FPR6); + FMIN(fpr.V(vregs[i]), fpr.V(vregs[i]), FPR7); + } else if (sat == 3) { // clamp lane to [-1, 1] + fpr.MapRegV(vregs[i], MAP_DIRTY); + + MOVI2F(FPR6, -1.0f); + MOVI2F(FPR7, 1.0f); + + FMAX(fpr.V(vregs[i]), fpr.V(vregs[i]), FPR6); + FMIN(fpr.V(vregs[i]), fpr.V(vregs[i]), FPR7); + } + } + } + + void Jit::Comp_SV(MIPSOpcode op) { + CONDITIONAL_DISABLE; + + s32 imm = (signed short)(op&0xFFFC); + int vt = ((op >> 16) & 0x1f) | ((op & 3) << 5); + int rs = _RS; + + if (!g_Config.bFastMemory) { + DISABLE; + } + + bool doCheck = false; // never read in this function + switch (op >> 26) + { + case 50: //lv.s // VI(vt) = Memory::Read_U32(addr); + { + // CC might be set by slow path below, so load regs first. + fpr.MapRegV(vt, MAP_DIRTY | MAP_NOINIT); + if (gpr.IsImm(rs)) { + u32 addr = (imm + gpr.GetImm(rs)) & 0x3FFFFFFF; + MOVI2R(SREG, addr); + } else { + gpr.MapReg(rs); + SetRegToEffectiveAddress(SREG, rs, imm); + } + + LoadFloatSwap(fpr.V(vt), BASEREG, SREG); + } + break; + + case 58: //sv.s // Memory::Write_U32(VI(vt), addr); + { + // CC might be set by slow path below, so load regs first. 
+ fpr.MapRegV(vt); + if (gpr.IsImm(rs)) { + u32 addr = (imm + gpr.GetImm(rs)) & 0x3FFFFFFF; + MOVI2R(SREG, addr); + } else { + gpr.MapReg(rs); + SetRegToEffectiveAddress(SREG, rs, imm); + } + SaveFloatSwap(fpr.V(vt), BASEREG, SREG); + } + break; + + + default: + DISABLE; + } + } + + void Jit::Comp_SVQ(MIPSOpcode op) { + // Comp_Generic(op); + CONDITIONAL_DISABLE; + + int imm = (signed short)(op&0xFFFC); + int vt = (((op >> 16) & 0x1f)) | ((op&1) << 5); + int rs = _RS; + + if (!g_Config.bFastMemory) { + DISABLE; + } + + bool doCheck = false; // never read in this function + switch (op >> 26) + { + case 54: //lv.q + { + u8 vregs[4]; + GetVectorRegs(vregs, V_Quad, vt); + fpr.MapRegsAndSpillLockV(vregs, V_Quad, MAP_DIRTY | MAP_NOINIT); + + if (gpr.IsImm(rs)) { + u32 addr = (imm + gpr.GetImm(rs)) & 0x3FFFFFFF; + MOVI2R(SREG, addr + (u32)Memory::base); // Memory::base is folded in here, so the lane accesses below index from SREG rather than BASEREG + } else { + gpr.MapReg(rs); + SetRegToEffectiveAddress(SREG, rs, imm); + ADD(SREG, SREG, BASEREG); + } + + for (int i = 0; i < 4; i++) { + MOVI2R(R9, i * 4); // byte offset of lane i + LoadFloatSwap(fpr.V(vregs[i]), SREG, R9); + } + } + break; + + case 62: //sv.q + { + // CC might be set by slow path below, so load regs first. 
+ u8 vregs[4]; + GetVectorRegs(vregs, V_Quad, vt); + fpr.MapRegsAndSpillLockV(vregs, V_Quad, 0); + + if (gpr.IsImm(rs)) { + u32 addr = (imm + gpr.GetImm(rs)) & 0x3FFFFFFF; + MOVI2R(SREG, addr + (u32)Memory::base); // Memory::base is folded in here, so the lane accesses below index from SREG rather than BASEREG + } else { + gpr.MapReg(rs); + SetRegToEffectiveAddress(SREG, rs, imm); + ADD(SREG, SREG, BASEREG); + } + + for (int i = 0; i < 4; i++) { + MOVI2R(R9, i * 4); // byte offset of lane i + SaveFloatSwap(fpr.V(vregs[i]), SREG, R9); + } + } + break; + + default: + DISABLE; + break; + } + fpr.ReleaseSpillLocksAndDiscardTemps(); + } + + void Jit::Comp_VPFX(MIPSOpcode op) { + CONDITIONAL_DISABLE; + int data = op & 0xFFFFF; // low 20 bits of the opcode are recorded as the prefix value + int regnum = (op >> 24) & 3; + switch (regnum) { + case 0: // S + js.prefixS = data; + js.prefixSFlag = PpcJitState::PREFIX_KNOWN_DIRTY; + break; + case 1: // T + js.prefixT = data; + js.prefixTFlag = PpcJitState::PREFIX_KNOWN_DIRTY; + break; + case 2: // D + js.prefixD = data; + js.prefixDFlag = PpcJitState::PREFIX_KNOWN_DIRTY; + break; + default: + ERROR_LOG(CPU, "VPFX - bad regnum %i : data=%08x", regnum, data); + break; + } + } + + void Jit::Comp_VVectorInit(MIPSOpcode op) { + CONDITIONAL_DISABLE; + + // WARNING: No prefix support! (NOTE(review): the D prefix is nevertheless applied further down via GetVectorRegsPrefixD/ApplyPrefixD) 
+ if (js.HasUnknownPrefix() || disablePrefixes) { + DISABLE; + } + + switch ((op >> 16) & 0xF) + { + case 6: // v=zeros; break; //vzero + MOVI2F(FPR5, 0.0f); + break; + case 7: // v=ones; break; //vone + MOVI2F(FPR5, 1.0f); + break; + default: + DISABLE; + break; + } + + VectorSize sz = GetVecSize(op); + int n = GetNumVectorElements(sz); + + u8 dregs[4]; + GetVectorRegsPrefixD(dregs, sz, _VD); + fpr.MapRegsAndSpillLockV(dregs, sz, MAP_NOINIT | MAP_DIRTY); + + for (int i = 0; i < n; ++i) + FMR(fpr.V(dregs[i]), FPR5); + + ApplyPrefixD(dregs, sz); + + fpr.ReleaseSpillLocksAndDiscardTemps(); + } + + void Jit::Comp_VMatrixInit(MIPSOpcode op) { + CONDITIONAL_DISABLE; + + if (js.HasUnknownPrefix() || disablePrefixes) { + DISABLE; + } + + MatrixSize sz = GetMtxSize(op); + int n = GetMatrixSide(sz); + + u8 dregs[16]; + GetMatrixRegs(dregs, sz, _VD); + + switch ((op >> 16) & 0xF) { // NOTE(review): no default case - any other sub-op emits nothing; confirm the op tables never route one here + case 3: // vmidt + MOVI2F(FPR6, 0.0f); + MOVI2F(FPR7, 1.0f); + for (int a = 0; a < n; a++) { + for (int b = 0; b < n; b++) { + fpr.MapRegV(dregs[a * 4 + b], MAP_DIRTY | MAP_NOINIT); + FMR(fpr.V(dregs[a * 4 + b]), a == b ? FPR7 : FPR6); + } + } + break; + case 6: // vmzero + MOVI2F(FPR6, 0.0f); + for (int a = 0; a < n; a++) { + for (int b = 0; b < n; b++) { + fpr.MapRegV(dregs[a * 4 + b], MAP_DIRTY | MAP_NOINIT); + FMR(fpr.V(dregs[a * 4 + b]), FPR6); + } + } + break; + case 7: // vmone + MOVI2F(FPR7, 1.0f); + for (int a = 0; a < n; a++) { + for (int b = 0; b < n; b++) { + fpr.MapRegV(dregs[a * 4 + b], MAP_DIRTY | MAP_NOINIT); + FMR(fpr.V(dregs[a * 4 + b]), FPR7); + } + } + break; + } + + fpr.ReleaseSpillLocksAndDiscardTemps(); + } + + void Jit::Comp_VDot(MIPSOpcode op) { + CONDITIONAL_DISABLE; + if (js.HasUnknownPrefix() || disablePrefixes) { + DISABLE; + } + + int vd = _VD; + int vs = _VS; + int vt = _VT; + VectorSize sz = GetVecSize(op); + + // TODO: Force read one of them into regs? probably not. 
+ u8 sregs[4], tregs[4], dregs[1]; + GetVectorRegsPrefixS(sregs, sz, vs); + GetVectorRegsPrefixT(tregs, sz, vt); + GetVectorRegsPrefixD(dregs, V_Single, vd); + + // TODO: applyprefixST here somehow (shuffle, etc...) + fpr.MapRegsAndSpillLockV(sregs, sz, 0); + fpr.MapRegsAndSpillLockV(tregs, sz, 0); + FMULS(FPR6, fpr.V(sregs[0]), fpr.V(tregs[0])); // sum = s[0]*t[0], accumulated in FPR6 + + int n = GetNumVectorElements(sz); + for (int i = 1; i < n; i++) { + // sum += s[i]*t[i]; + FMADDS(FPR6, fpr.V(sregs[i]), fpr.V(tregs[i]), FPR6); + } + fpr.ReleaseSpillLocksAndDiscardTemps(); + + fpr.MapRegV(dregs[0], MAP_NOINIT | MAP_DIRTY); + + // TODO: applyprefixD here somehow (write mask etc..) + FMR(fpr.V(dregs[0]), FPR6); + ApplyPrefixD(dregs, V_Single); + fpr.ReleaseSpillLocksAndDiscardTemps(); + } + + void Jit::Comp_VecDo3(MIPSOpcode op) { + CONDITIONAL_DISABLE; + + if (js.HasUnknownPrefix() || disablePrefixes) { + DISABLE; + } + + int vd = _VD; + int vs = _VS; + int vt = _VT; // vd/vs/vt are not read below; the _VD/_VS/_VT macros are used directly + + VectorSize sz = GetVecSize(op); + int n = GetNumVectorElements(sz); + + u8 sregs[4], tregs[4], dregs[4]; + GetVectorRegsPrefixS(sregs, sz, _VS); + GetVectorRegsPrefixT(tregs, sz, _VT); + GetVectorRegsPrefixD(dregs, sz, _VD); + + MIPSReg tempregs[4]; + for (int i = 0; i < n; i++) { + if (!IsOverlapSafe(dregs[i], i, n, sregs, n, tregs)) { + tempregs[i] = fpr.GetTempV(); + } else { + tempregs[i] = dregs[i]; + } + } + + for (int i = 0; i < n; i++) { + fpr.MapDirtyInInV(tempregs[i], sregs[i], tregs[i]); + switch (op >> 26) { + case 24: //VFPU0 + switch ((op >> 23)&7) { + case 0: // d[i] = s[i] + t[i]; break; //vadd + FADDS(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i])); + break; + case 1: // d[i] = s[i] - t[i]; break; //vsub + FSUBS(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i])); + break; + case 7: // d[i] = s[i] / t[i]; break; //vdiv + FDIVS(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i])); + break; + default: + DISABLE; + } + break; + case 25: //VFPU1 + switch ((op >> 23) & 7) { + case 0: // d[i] = s[i] * t[i]; break; 
//vmul + FMULS(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i])); + break; + default: + DISABLE; + } + break; + case 27: //VFPU3 + // DISABLE + + switch ((op >> 23) & 7) { + case 2: // vmin + FMIN(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i])); + break; + case 3: // vmax + FMAX(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i])); + break; + case 6: // vsge + // DISABLE; // pending testing + MOVI2F(FPR6, 1.0f); + MOVI2F(FPR7, 0.0f); + FSUBS(FPR8, fpr.V(sregs[i]), fpr.V(tregs[i])); + FSEL(fpr.V(tempregs[i]), FPR8, FPR6, FPR7); + break; + case 7: // vslt + // DISABLE; // pending testing + MOVI2F(FPR6, 1.0f); + MOVI2F(FPR7, 0.0f); + FSUBS(FPR8, fpr.V(sregs[i]), fpr.V(tregs[i])); + FSEL(fpr.V(tempregs[i]), FPR8, FPR7, FPR6); + break; + } + break; + + default: + DISABLE; + } + } + + for (int i = 0; i < n; i++) { + if (dregs[i] != tempregs[i]) { + fpr.MapDirtyInV(dregs[i], tempregs[i]); + FMR(fpr.V(dregs[i]), fpr.V(tempregs[i])); + } + } + ApplyPrefixD(dregs, sz); + + fpr.ReleaseSpillLocksAndDiscardTemps(); + } + + void Jit::Comp_VV2Op(MIPSOpcode op) { + CONDITIONAL_DISABLE; + + if (js.HasUnknownPrefix() || disablePrefixes) { + DISABLE; + } + + // Pre-processing: Eliminate silly no-op VMOVs, common in Wipeout Pure + if (((op >> 16) & 0x1f) == 0 && _VS == _VD && js.HasNoPrefix()) { + return; + } + + VectorSize sz = GetVecSize(op); + int n = GetNumVectorElements(sz); + + u8 sregs[4], dregs[4]; + GetVectorRegsPrefixS(sregs, sz, _VS); + GetVectorRegsPrefixD(dregs, sz, _VD); + + MIPSReg tempregs[4]; + for (int i = 0; i < n; ++i) { + if (!IsOverlapSafe(dregs[i], i, n, sregs)) { + tempregs[i] = fpr.GetTempV(); + } else { + tempregs[i] = dregs[i]; + } + } + + // Warning: sregs[i] and tempxregs[i] may be the same reg. + // Helps for vmov, hurts for vrcp, etc. + for (int i = 0; i < n; ++i) { + switch ((op >> 16) & 0x1f) { + case 0: // d[i] = s[i]; break; //vmov + // Probably for swizzle. 
+ fpr.MapDirtyInV(tempregs[i], sregs[i]); + FMR(fpr.V(tempregs[i]), fpr.V(sregs[i])); + break; + case 1: // d[i] = fabsf(s[i]); break; //vabs + fpr.MapDirtyInV(tempregs[i], sregs[i]); + FABS(fpr.V(tempregs[i]), fpr.V(sregs[i])); + break; + case 2: // d[i] = -s[i]; break; //vneg + fpr.MapDirtyInV(tempregs[i], sregs[i]); + FNEG(fpr.V(tempregs[i]), fpr.V(sregs[i])); + break; + + /* These are probably just as broken as the prefix. + case 4: // if (s[i] < 0) d[i] = 0; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat0 + fpr.MapDirtyInV(tempregs[i], sregs[i]); + MOVI2F(S0, 0.5f, R0); + VABS(S1, fpr.V(sregs[i])); // S1 = fabs(x) + VSUB(fpr.V(tempregs[i]), fpr.V(sregs[i]), S0); // S2 = fabs(x-0.5f) {VABD} + VABS(fpr.V(tempregs[i]), fpr.V(tempregs[i])); + VSUB(fpr.V(tempregs[i]), S1, fpr.V(tempregs[i])); // v[i] = S1 - S2 + 0.5f + VADD(fpr.V(tempregs[i]), fpr.V(tempregs[i]), S0); + break; + case 5: // if (s[i] < -1.0f) d[i] = -1.0f; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat1 + fpr.MapDirtyInV(tempregs[i], sregs[i]); + MOVI2F(S0, 1.0f, R0); + VABS(S1, fpr.V(sregs[i])); // S1 = fabs(x) + VSUB(fpr.V(tempregs[i]), fpr.V(sregs[i]), S0); // S2 = fabs(x-1.0f) {VABD} + VABS(fpr.V(tempregs[i]), fpr.V(tempregs[i])); + VSUB(fpr.V(tempregs[i]), S1, fpr.V(tempregs[i])); // v[i] = S1 - S2 + break; + */ + + case 16: // d[i] = 1.0f / s[i]; break; //vrcp + fpr.MapDirtyInV(tempregs[i], sregs[i]); + MOVI2F(FPR6, 1.0f); + FDIVS(fpr.V(tempregs[i]), FPR6, fpr.V(sregs[i])); + break; +#if !PPSSPP_ARCH(PPC750) + case 17: // d[i] = 1.0f / sqrtf(s[i]); break; //vrsq + fpr.MapDirtyInV(tempregs[i], sregs[i]); + MOVI2F(FPR6, 1.0f); + FSQRTS(FPR7, fpr.V(sregs[i])); + FDIVS(fpr.V(tempregs[i]), FPR6, FPR7); + break; +#endif + case 18: // d[i] = sinf((float)M_PI_2 * s[i]); break; //vsin + DISABLE; + break; + case 19: // d[i] = cosf((float)M_PI_2 * s[i]); break; //vcos + DISABLE; + break; + case 20: // d[i] = powf(2.0f, s[i]); break; //vexp2 + DISABLE; + break; + 
case 21: // d[i] = logf(s[i])/log(2.0f); break; //vlog2 + DISABLE; + break; +#if !PPSSPP_ARCH(PPC750) + case 22: // d[i] = sqrtf(s[i]); break; //vsqrt + fpr.MapDirtyInV(tempregs[i], sregs[i]); + FSQRTS(fpr.V(tempregs[i]), fpr.V(sregs[i])); + FABS(fpr.V(tempregs[i]), fpr.V(tempregs[i])); + break; +#endif + case 23: // d[i] = asinf(s[i] * (float)M_2_PI); break; //vasin + DISABLE; + break; + case 24: // d[i] = -1.0f / s[i]; break; // vnrcp + fpr.MapDirtyInV(tempregs[i], sregs[i]); + MOVI2F(FPR6, -1.0f); + FDIVS(fpr.V(tempregs[i]), FPR6, fpr.V(sregs[i])); + break; + case 26: // d[i] = -sinf((float)M_PI_2 * s[i]); break; // vnsin + DISABLE; + break; + case 28: // d[i] = 1.0f / expf(s[i] * (float)M_LOG2E); break; // vrexp2 + DISABLE; + break; + default: + DISABLE; + break; + } + } + + for (int i = 0; i < n; ++i) { + if (dregs[i] != tempregs[i]) { + fpr.MapDirtyInV(dregs[i], tempregs[i]); + FMR(fpr.V(dregs[i]), fpr.V(tempregs[i])); + } + } + + ApplyPrefixD(dregs, sz); + + fpr.ReleaseSpillLocksAndDiscardTemps(); + } + + void Jit::Comp_Mftv(MIPSOpcode op) { + CONDITIONAL_DISABLE; + + int imm = op & 0xFF; + MIPSGPReg rt = _RT; + switch ((op >> 21) & 0x1f) + { + case 3: //mfv / mfvc + // rt = 0, imm = 255 appears to be used as a CPU interlock by some games. + if (rt != 0) { + if (imm < 128) { //R(rt) = VI(imm); + fpr.FlushV(imm); + gpr.MapReg(rt, MAP_NOINIT | MAP_DIRTY); + LWZ(gpr.R(rt), CTXREG, fpr.GetMipsRegOffsetV(imm)); + } else if (imm < 128 + VFPU_CTRL_MAX) { //mtvc + DISABLE; + // In case we have a saved prefix. + //FlushPrefixV(); + //gpr.BindToRegister(rt, false, true); + //MOV(32, gpr.R(rt), M(&mips_->vfpuCtrl[imm - 128])); + } else { + //ERROR - maybe need to make this value too an "interlock" value? 
+ ERROR_LOG(CPU, "mfv - invalid register %i", imm); + } + } + break; + + case 7: // mtv + if (imm < 128) { + gpr.FlushR(rt); + fpr.MapRegV(imm, MAP_DIRTY | MAP_NOINIT); + LFS(fpr.V(imm), CTXREG, gpr.GetMipsRegOffset(rt)); + } else if (imm < 128 + VFPU_CTRL_MAX) { //mtvc //currentMIPS->vfpuCtrl[imm - 128] = R(rt); + gpr.MapReg(rt); + STW(gpr.R(rt), CTXREG, offsetof(MIPSState, vfpuCtrl) + 4 * (imm - 128)); + //gpr.BindToRegister(rt, true, false); + //MOV(32, M(&mips_->vfpuCtrl[imm - 128]), gpr.R(rt)); + + // TODO: Optimization if rt is Imm? + // Set these BEFORE disable! + if (imm - 128 == VFPU_CTRL_SPREFIX) { + js.prefixSFlag = PpcJitState::PREFIX_UNKNOWN; + } else if (imm - 128 == VFPU_CTRL_TPREFIX) { + js.prefixTFlag = PpcJitState::PREFIX_UNKNOWN; + } else if (imm - 128 == VFPU_CTRL_DPREFIX) { + js.prefixDFlag = PpcJitState::PREFIX_UNKNOWN; + } + } else { + //ERROR + _dbg_assert_msg_(0,"mtv - invalid register"); + } + break; + + default: + DISABLE; + } + + fpr.ReleaseSpillLocksAndDiscardTemps(); + } + + void Jit::Comp_Vmfvc(MIPSOpcode op) { + CONDITIONAL_DISABLE; + + int vs = _VS; + int imm = op & 0xFF; + if (imm >= 128 && imm < 128 + VFPU_CTRL_MAX) { + fpr.MapRegV(vs); + ADDI(SREG, CTXREG, offsetof(MIPSState, vfpuCtrl[0]) + (imm - 128) * 4); + LFS(fpr.V(vs), SREG, 0); + fpr.ReleaseSpillLocksAndDiscardTemps(); + } + } + + void Jit::Comp_Vmtvc(MIPSOpcode op) { + CONDITIONAL_DISABLE; + + int vs = _VS; + int imm = op & 0xFF; + if (imm >= 128 && imm < 128 + VFPU_CTRL_MAX) { + fpr.MapRegV(vs); + ADDI(SREG, CTXREG, offsetof(MIPSState, vfpuCtrl[0]) + (imm - 128) * 4); + SFS(fpr.V(vs), SREG, 0); + fpr.ReleaseSpillLocksAndDiscardTemps(); + + if (imm - 128 == VFPU_CTRL_SPREFIX) { + js.prefixSFlag = PpcJitState::PREFIX_UNKNOWN; + } else if (imm - 128 == VFPU_CTRL_TPREFIX) { + js.prefixTFlag = PpcJitState::PREFIX_UNKNOWN; + } else if (imm - 128 == VFPU_CTRL_DPREFIX) { + js.prefixDFlag = PpcJitState::PREFIX_UNKNOWN; + } + } + } + + void Jit::Comp_Vmmov(MIPSOpcode op) { + 
CONDITIONAL_DISABLE; + + // TODO: This probably ignores prefixes? + //if (js.MayHavePrefix()) { + // DISABLE; + //} + + if (_VS == _VD) { + // A lot of these in Wipeout... Just drop the instruction entirely. + return; + } + + MatrixSize sz = GetMtxSize(op); + int n = GetMatrixSide(sz); + + u8 sregs[16], dregs[16]; + GetMatrixRegs(sregs, sz, _VS); + GetMatrixRegs(dregs, sz, _VD); + + // Rough overlap check. + bool overlap = false; + if (GetMtx(_VS) == GetMtx(_VD)) { + // Potential overlap (guaranteed for 3x3 or more). + overlap = true; + } + + if (overlap) { + // Not so common, fallback. + DISABLE; + } else { + for (int a = 0; a < n; a++) { + for (int b = 0; b < n; b++) { + fpr.MapDirtyInV(dregs[a * 4 + b], sregs[a * 4 + b]); + FMR(fpr.V(dregs[a * 4 + b]), fpr.V(sregs[a * 4 + b])); + } + } + fpr.ReleaseSpillLocksAndDiscardTemps(); + } + } + + void Jit::Comp_VScl(MIPSOpcode op) { + CONDITIONAL_DISABLE; + + if (js.HasUnknownPrefix() || disablePrefixes) { + DISABLE; + } + + VectorSize sz = GetVecSize(op); + int n = GetNumVectorElements(sz); + + u8 sregs[4], dregs[4], treg; + GetVectorRegsPrefixS(sregs, sz, _VS); + GetVectorRegs(&treg, V_Single, _VT); + GetVectorRegsPrefixD(dregs, sz, _VD); + + // Move to S0 early, so we don't have to worry about overlap with scale. + fpr.LoadToRegV(FPR6, treg); + + // For prefixes to work, we just have to ensure that none of the output registers spill + // and that there's no overlap. + MIPSReg tempregs[4]; + for (int i = 0; i < n; ++i) { + if (!IsOverlapSafe(dregs[i], i, n, sregs)) { + // Need to use temp regs + tempregs[i] = fpr.GetTempV(); + } else { + tempregs[i] = dregs[i]; + } + } + + // The meat of the function! + for (int i = 0; i < n; i++) { + fpr.MapDirtyInV(tempregs[i], sregs[i]); + FMULS(fpr.V(tempregs[i]), fpr.V(sregs[i]), FPR6); + } + + for (int i = 0; i < n; i++) { + // All must be mapped for prefixes to work. 
+ if (dregs[i] != tempregs[i]) { + fpr.MapDirtyInV(dregs[i], tempregs[i]); + FMR(fpr.V(dregs[i]), fpr.V(tempregs[i])); + } + } + + ApplyPrefixD(dregs, sz); + + fpr.ReleaseSpillLocksAndDiscardTemps(); + } + + void Jit::Comp_Vmmul(MIPSOpcode op) { + CONDITIONAL_DISABLE; + + // TODO: This probably ignores prefixes? + if (js.MayHavePrefix() || disablePrefixes) { + DISABLE; + } + + MatrixSize sz = GetMtxSize(op); + int n = GetMatrixSide(sz); + + u8 sregs[16], tregs[16], dregs[16]; + GetMatrixRegs(sregs, sz, _VS); + GetMatrixRegs(tregs, sz, _VT); + GetMatrixRegs(dregs, sz, _VD); + + // Rough overlap check. + bool overlap = false; + if (GetMtx(_VS) == GetMtx(_VD) || GetMtx(_VT) == GetMtx(_VD)) { + // Potential overlap (guaranteed for 3x3 or more). + overlap = true; + } + + if (overlap) { + DISABLE; + } else { + for (int a = 0; a < n; a++) { + for (int b = 0; b < n; b++) { + fpr.MapInInV(sregs[b * 4], tregs[a * 4]); + FMULS(FPR6, fpr.V(sregs[b * 4]), fpr.V(tregs[a * 4])); + for (int c = 1; c < n; c++) { + fpr.MapInInV(sregs[b * 4 + c], tregs[a * 4 + c]); + FMADDS(FPR6, fpr.V(sregs[b * 4 + c]), fpr.V(tregs[a * 4 + c]), FPR6); + } + fpr.MapRegV(dregs[a * 4 + b], MAP_DIRTY | MAP_NOINIT); + FMR(fpr.V(dregs[a * 4 + b]), FPR6); + } + } + fpr.ReleaseSpillLocksAndDiscardTemps(); + } + } + + void Jit::Comp_Vmscl(MIPSOpcode op) { + DISABLE; + } + + void Jit::Comp_Vtfm(MIPSOpcode op) { + CONDITIONAL_DISABLE; + + // TODO: This probably ignores prefixes? Or maybe uses D? + if (js.MayHavePrefix() || disablePrefixes) { + DISABLE; + } + + VectorSize sz = GetVecSize(op); + MatrixSize msz = GetMtxSize(op); + int n = GetNumVectorElements(sz); + int ins = (op >> 23) & 7; + + bool homogenous = false; + if (n == ins) { + n++; + sz = (VectorSize)((int)(sz) + 1); + msz = (MatrixSize)((int)(msz) + 1); + homogenous = true; + } + // Otherwise, n should already be ins + 1. 
+ else if (n != ins + 1) { + DISABLE; + } + + u8 sregs[16], dregs[4], tregs[4]; + GetMatrixRegs(sregs, msz, _VS); + GetVectorRegs(tregs, sz, _VT); + GetVectorRegs(dregs, sz, _VD); + + // TODO: test overlap, optimize. + int tempregs[4]; + for (int i = 0; i < n; i++) { + fpr.MapInInV(sregs[i * 4], tregs[0]); + FMULS(FPR6, fpr.V(sregs[i * 4]), fpr.V(tregs[0])); + for (int k = 1; k < n; k++) { + if (!homogenous || k != n - 1) { + fpr.MapInInV(sregs[i * 4 + k], tregs[k]); + FMADDS(FPR6, fpr.V(sregs[i * 4 + k]), fpr.V(tregs[k]), FPR6); + } else { + fpr.MapRegV(sregs[i * 4 + k]); + FADDS(FPR6, FPR6, fpr.V(sregs[i * 4 + k])); + } + } + + int temp = fpr.GetTempV(); + fpr.MapRegV(temp, MAP_NOINIT | MAP_DIRTY); + fpr.SpillLockV(temp); + FMR(fpr.V(temp), FPR6); + tempregs[i] = temp; + } + for (int i = 0; i < n; i++) { + u8 temp = tempregs[i]; + fpr.MapRegV(dregs[i], MAP_NOINIT | MAP_DIRTY); + FMR(fpr.V(dregs[i]), fpr.V(temp)); + } + + fpr.ReleaseSpillLocksAndDiscardTemps(); + } + + void Jit::Comp_VHdp(MIPSOpcode op) { + DISABLE; + } + + void Jit::Comp_VCrs(MIPSOpcode op) { + DISABLE; + } + + void Jit::Comp_VDet(MIPSOpcode op) { + DISABLE; + } + + void Jit::Comp_Vi2x(MIPSOpcode op) { + DISABLE; + } + + void Jit::Comp_Vx2i(MIPSOpcode op) { + DISABLE; + } + + void Jit::Comp_Vf2i(MIPSOpcode op) { + DISABLE; + } + + void Jit::Comp_Vi2f(MIPSOpcode op) { +#if PPSSPP_ARCH(32BIT) + DISABLE; +#endif + CONDITIONAL_DISABLE; + + if (js.HasUnknownPrefix() || disablePrefixes) + DISABLE; + + VectorSize sz = GetVecSize(op); + int n = GetNumVectorElements(sz); + + int imm = (op >> 16) & 0x1f; + const float mult = 1.0f / (float)(1UL << imm); + + u8 sregs[4], dregs[4]; + GetVectorRegsPrefixS(sregs, sz, _VS); + GetVectorRegsPrefixD(dregs, sz, _VD); + + MIPSReg tempregs[4]; + for (int i = 0; i < n; ++i) { + if (!IsOverlapSafe(dregs[i], i, n, sregs)) { + tempregs[i] = fpr.GetTempV(); + } else { + tempregs[i] = dregs[i]; + } + } + + if (mult != 1.0f) + MOVI2F(FPR5, mult, false); + + u64 tmp = 0; + 
MOVI2R(SREG, (u32)&tmp); + + //Break(); + +#if PPSSPP_ARCH(64BIT) + for (int i = 0; i < n; i++) { + // Crappy code !! + fpr.MapDirtyInV(tempregs[i], sregs[i]); + + // float => mem + SFS(fpr.V(sregs[i]), SREG, 0); + + // int <= mem + LWZ(R6, SREG, 0); + //RLDICL(R6, R6, 0, 23); + EXTSW(R6, R6); + + // int => mem + STD(R6, SREG, 0); + + // float <= mem + LFD(fpr.V(tempregs[i]), SREG, 0); + + FCFID(fpr.V(tempregs[i]), fpr.V(tempregs[i])); + FRSP(fpr.V(tempregs[i]), fpr.V(tempregs[i])); + + if (mult != 1.0f) + FMULS(fpr.V(tempregs[i]), fpr.V(tempregs[i]), FPR5); + } +#else + // TODO: + Crash(); +#endif + + //Break(); + + for (int i = 0; i < n; ++i) { + if (dregs[i] != tempregs[i]) { + fpr.MapDirtyInV(dregs[i], tempregs[i]); + FMR(fpr.V(dregs[i]), fpr.V(tempregs[i])); + } + } + + ApplyPrefixD(dregs, sz); + fpr.ReleaseSpillLocksAndDiscardTemps(); + } + + void Jit::Comp_Vh2f(MIPSOpcode op) { + DISABLE; + } + + void Jit::Comp_Vcst(MIPSOpcode op) { + CONDITIONAL_DISABLE; + + if (js.HasUnknownPrefix() || disablePrefixes) { + DISABLE; + } + + int conNum = (op >> 16) & 0x1f; + int vd = _VD; + + VectorSize sz = GetVecSize(op); + int n = GetNumVectorElements(sz); + + u8 dregs[4]; + GetVectorRegsPrefixD(dregs, sz, _VD); + fpr.MapRegsAndSpillLockV(dregs, sz, MAP_NOINIT | MAP_DIRTY); + + MOVI2R(SREG, (u32)(void *)&cst_constants[conNum]); + LFS(FPR6, SREG, 0); + for (int i = 0; i < n; ++i) + FMR(fpr.V(dregs[i]), FPR6); + + ApplyPrefixD(dregs, sz); + fpr.ReleaseSpillLocksAndDiscardTemps(); + } + + void Jit::Comp_Vhoriz(MIPSOpcode op) { + DISABLE; + } + + void Jit::Comp_VRot(MIPSOpcode op) { + DISABLE; + } + + void Jit::Comp_VIdt(MIPSOpcode op) { + CONDITIONAL_DISABLE + + if (js.HasUnknownPrefix() || disablePrefixes) { + DISABLE; + } + + int vd = _VD; + VectorSize sz = GetVecSize(op); + int n = GetNumVectorElements(sz); + MOVI2F(FPR6, 0.0f); + MOVI2F(FPR7, 1.0f); + u8 dregs[4]; + GetVectorRegsPrefixD(dregs, sz, _VD); + fpr.MapRegsAndSpillLockV(dregs, sz, MAP_NOINIT | MAP_DIRTY); + 
switch (sz) + { + case V_Pair: + FMR(fpr.V(dregs[0]), (vd&1)==0 ? FPR7 : FPR6); + FMR(fpr.V(dregs[1]), (vd&1)==1 ? FPR7 : FPR6); + break; + case V_Quad: + FMR(fpr.V(dregs[0]), (vd&3)==0 ? FPR7 : FPR6); + FMR(fpr.V(dregs[1]), (vd&3)==1 ? FPR7 : FPR6); + FMR(fpr.V(dregs[2]), (vd&3)==2 ? FPR7 : FPR6); + FMR(fpr.V(dregs[3]), (vd&3)==3 ? FPR7 : FPR6); + break; + default: + _dbg_assert_msg_(0,"Trying to interpret instruction that can't be interpreted"); + break; + } + + ApplyPrefixD(dregs, sz); + + fpr.ReleaseSpillLocksAndDiscardTemps(); + } + + void Jit::Comp_Vcmp(MIPSOpcode op) { + DISABLE; + } + + void Jit::Comp_Vcmov(MIPSOpcode op) { + DISABLE; + } + + void Jit::Comp_Viim(MIPSOpcode op) { + DISABLE; + + u8 dreg; + GetVectorRegs(&dreg, V_Single, _VT); + + s32 imm = (s32)(s16)(u16)(op & 0xFFFF); + fpr.MapRegV(dreg, MAP_DIRTY | MAP_NOINIT); + MOVI2F(fpr.V(dreg), (float)imm); + + ApplyPrefixD(&dreg, V_Single); + fpr.ReleaseSpillLocksAndDiscardTemps(); + } + + void Jit::Comp_Vfim(MIPSOpcode op) { + CONDITIONAL_DISABLE; + + if (js.HasUnknownPrefix() || disablePrefixes) { + DISABLE; + } + + u8 dreg; + GetVectorRegs(&dreg, V_Single, _VT); + + FP16 half; + half.u = op & 0xFFFF; + FP32 fval = half_to_float_fast5(half); + fpr.MapRegV(dreg, MAP_DIRTY | MAP_NOINIT); + MOVI2F(fpr.V(dreg), fval.f); + + ApplyPrefixD(&dreg, V_Single); + fpr.ReleaseSpillLocksAndDiscardTemps(); + } + + void Jit::Comp_VCrossQuat(MIPSOpcode op) { + DISABLE; + CONDITIONAL_DISABLE; + + if (js.HasUnknownPrefix() || disablePrefixes) { + DISABLE; + } + + VectorSize sz = GetVecSize(op); + int n = GetNumVectorElements(sz); + + u8 sregs[4], tregs[4], dregs[4]; + GetVectorRegs(sregs, sz, _VS); + GetVectorRegs(tregs, sz, _VT); + GetVectorRegs(dregs, sz, _VD); + + // Map everything into registers. 
+ fpr.MapRegsAndSpillLockV(sregs, sz, 0); + fpr.MapRegsAndSpillLockV(tregs, sz, 0); + + if (sz == V_Triple) { + int temp3 = fpr.GetTempV(); + fpr.MapRegV(temp3, MAP_DIRTY | MAP_NOINIT); + // Cross product vcrsp.t + + // Compute X + FMULS(FPR6, fpr.V(sregs[1]), fpr.V(tregs[2])); + FMSUBS(FPR6, fpr.V(sregs[2]), fpr.V(tregs[1]), FPR6); + + // Compute Y + FMULS(FPR7, fpr.V(sregs[2]), fpr.V(tregs[0])); + FMSUBS(FPR7, fpr.V(sregs[0]), fpr.V(tregs[2]), FPR7); + + // Compute Z + FMULS(fpr.V(temp3), fpr.V(sregs[0]), fpr.V(tregs[1])); + FMSUBS(fpr.V(temp3), fpr.V(sregs[1]), fpr.V(tregs[0]), fpr.V(temp3)); + + fpr.MapRegsAndSpillLockV(dregs, V_Triple, MAP_DIRTY | MAP_NOINIT); + FMR(fpr.V(dregs[0]), FPR6); + FMR(fpr.V(dregs[1]), FPR7); + FMR(fpr.V(dregs[2]), fpr.V(temp3)); + } else if (sz == V_Quad) { + // Quaternion product vqmul.q untested + DISABLE; + } + + fpr.ReleaseSpillLocksAndDiscardTemps(); + } + void Jit::Comp_Vsgn(MIPSOpcode op) { + DISABLE; + } + void Jit::Comp_Vocp(MIPSOpcode op) { + DISABLE; + } + void Jit::Comp_ColorConv(MIPSOpcode op) { + DISABLE; + } + void Jit::Comp_Vbfy(MIPSOpcode op) { + DISABLE; + } +} + +#endif diff --git a/Core/MIPS/PPC/PpcJit.cpp b/Core/MIPS/PPC/PpcJit.cpp new file mode 100644 index 000000000000..00eb8eac39dc --- /dev/null +++ b/Core/MIPS/PPC/PpcJit.cpp @@ -0,0 +1,346 @@ +#include "ppsspp_config.h" +#if PPSSPP_ARCH(POWERPC) + +#include "profiler/profiler.h" +#include "Common/Serialize/Serializer.h" +#include "Core/Core.h" +#include "Core/CoreTiming.h" +#include "Core/MemMap.h" +#include "Core/MIPS/MIPS.h" +#include "Core/MIPS/MIPSCodeUtils.h" +#include "Core/MIPS/MIPSInt.h" +#include "Core/MIPS/MIPSTables.h" + +#include "PpcRegCache.h" +#include "ppcEmitter.h" +#include "PpcJit.h" + +#include +//#include + +using namespace PpcGen; + +extern volatile CoreState coreState; + +namespace MIPSComp +{ + +static u32 delaySlotFlagsValue; + +void Jit::CompileDelaySlot(int flags) +{ + // preserve flag around the delay slot! 
Maybe this is not always necessary on ARM where + // we can (mostly) control whether we set the flag or not. Of course, if someone puts an slt in to the + // delay slot, we're screwed. + if (flags & DELAYSLOT_SAFE) { + // Save flags register + MOVI2R(SREG, (u32)&delaySlotFlagsValue); + STW(FLAGREG, SREG); + MFCR(R19); + } + + js.inDelaySlot = true; + MIPSOpcode op = Memory::Read_Instruction(js.compilerPC + 4); + MIPSCompileOp(op, this); + js.inDelaySlot = false; + + if (flags & DELAYSLOT_FLUSH) + FlushAll(); + + if (flags & DELAYSLOT_SAFE) { + // Restore flags register + MOVI2R(SREG, (u32)&delaySlotFlagsValue); + LWZ(FLAGREG, SREG); + MTCR(R19); + } +} + +void Jit::Compile(u32 em_address) +{ + PROFILE_THIS_SCOPE("jitc"); + if (GetSpaceLeft() < 0x10000 || blocks.IsFull()) + { + ClearCache(); + } + + int block_num = blocks.AllocateBlock(em_address); + JitBlock *b = blocks.GetBlock(block_num); + DoJit(em_address, b); + blocks.FinalizeBlock(block_num, jo.enableBlocklink); + + // Drat. The VFPU hit an uneaten prefix at the end of a block. + if (js.startDefaultPrefix && js.MayHavePrefix()) + { + js.startDefaultPrefix = false; + // Our assumptions are all wrong so it's clean-slate time. + ClearCache(); + + // Let's try that one more time. We won't get back here because we toggled the value. + Compile(em_address); + } +} + +bool Jit::DescribeCodePtr(const u8 *ptr, std::string &name) +{ + // TODO: Not used by anything yet. 
+ return false; +} + +void Jit::LinkBlock(u8 *exitPoint, const u8 *checkedEntry) { + if (PlatformIsWXExclusive()) { + ProtectMemoryPages(exitPoint, 32, MEM_PROT_READ | MEM_PROT_WRITE); + } + PPCXEmitter emit(exitPoint); + emit.B(checkedEntry); + emit.FlushIcache(); + if (PlatformIsWXExclusive()) { + ProtectMemoryPages(exitPoint, 32, MEM_PROT_READ | MEM_PROT_EXEC); + } +} + +void Jit::UnlinkBlock(u8 *checkedEntry, u32 originalAddress) { + if (PlatformIsWXExclusive()) { + ProtectMemoryPages(checkedEntry, 16, MEM_PROT_READ | MEM_PROT_WRITE); + } + PPCXEmitter emit(checkedEntry); + emit.MOVI2R(R3, originalAddress); + emit.STW(R0, CTXREG, offsetof(MIPSState, pc)); + emit.B(dispatcher); + emit.FlushIcache(); + if (PlatformIsWXExclusive()) { + ProtectMemoryPages(checkedEntry, 16, MEM_PROT_READ | MEM_PROT_EXEC); + } +} + +void Jit::MovFromPC(PPCReg r) { + LWZ(r, CTXREG, offsetof(MIPSState, pc)); +} + +void Jit::MovToPC(PPCReg r) { + STW(r, CTXREG, offsetof(MIPSState, pc)); +} + +void Jit::SaveDowncount(PPCReg r) { + STW(r, CTXREG, offsetof(MIPSState, downcount)); +} + +void Jit::RestoreDowncount(PPCReg r) { + LWZ(r, CTXREG, offsetof(MIPSState, downcount)); +} + +static void ShowDownCount() { + if (currentMIPS->downcount<0) { + //ERROR_LOG(DYNA_REC, "MIPSState, downcount %08x", currentMIPS->downcount); + Crash(); + } +} + +void Jit::WriteDownCount(int offset) +{ + // don't know if the result is correct + int theDowncount = js.downcountAmount + offset; + if (jo.downcountInRegister) { + // DCNTREG = DCNTREG - theDowncount; + MOVI2R(SREG, theDowncount); + SUBF(DCNTREG, SREG, DCNTREG, 1); + STW(DCNTREG, CTXREG, offsetof(MIPSState, downcount)); + } else { + // DCNTREG = MIPSState->downcount - theDowncount; + MOVI2R(SREG, theDowncount); + LWZ(DCNTREG, CTXREG, offsetof(MIPSState, downcount)); + SUBF(DCNTREG, SREG, DCNTREG, 1); + STW(DCNTREG, CTXREG, offsetof(MIPSState, downcount)); + } + + //QuickCallFunction(ShowDownCount); + + CMPI(DCNTREG, 0); +} + +void 
Jit::Comp_Generic(MIPSOpcode op) { + FlushAll(); + + // basic jit !! + MIPSInterpretFunc func = MIPSGetInterpretFunc(op); + if (func) + { + // Save mips PC and cycles + SaveDowncount(DCNTREG); + MOVI2R(SREG, js.compilerPC); + MovToPC(SREG); + + // call interpreted function + MOVI2R(R3, op.encoding); +#ifdef __wiiu__ + // R3 is expected to contain the address of the 4-byte MIPSOpcode struct ... + ADDI(R1, R1, -8); + STWU(R3, R1, 0); + MR(R3, R1); +#endif + QuickCallFunction((void *)func); +#ifdef __wiiu__ + ADDI(R1, R1, 8); +#endif + + // restore pc and cycles + RestoreDowncount(DCNTREG); + } + const MIPSInfo info = MIPSGetInfo(op); + if ((info & IS_VFPU) != 0 && (info & VFPU_NO_PREFIX) == 0) + { + // If it does eat them, it'll happen in MIPSCompileOp(). + if ((info & OUT_EAT_PREFIX) == 0) + js.PrefixUnknown(); + } +} + +void Jit::EatInstruction(MIPSOpcode op) { + MIPSInfo info = MIPSGetInfo(op); + _dbg_assert_msg_(!(info & DELAYSLOT), "Never eat a branch op."); + _dbg_assert_msg_(!js.inDelaySlot, "Never eat an instruction inside a delayslot."); + + js.compilerPC += 4; + js.downcountAmount += MIPSGetInstructionCycleEstimate(op); +} + +void Jit::Comp_RunBlock(MIPSOpcode op) { + // This shouldn't be necessary, the dispatcher should catch us before we get here. + ERROR_LOG(JIT, "Comp_RunBlock should never be reached!"); +} + +void Jit::Comp_ReplacementFunc(MIPSOpcode op) +{ + // None of the code of this function is relevant so we'll just + // call the replacement and move RA to PC. + int replacementFunc = op & 0xFFFFFFF; + + + // We could even do this in the jal that is branching to the function + // but having the op is necessary for the interpreter anyway. 
+} + +void Jit::Comp_DoNothing(MIPSOpcode op) { + +} + +void Jit::FlushAll() +{ + gpr.FlushAll(); + fpr.FlushAll(); + FlushPrefixV(); +} + +void Jit::FlushPrefixV() +{ + if ((js.prefixSFlag & PpcJitState::PREFIX_DIRTY) != 0) { + MOVI2R(SREG, js.prefixS); + STW(SREG, CTXREG, offsetof(MIPSState, vfpuCtrl[VFPU_CTRL_SPREFIX])); + js.prefixSFlag = (PpcJitState::PrefixState) (js.prefixSFlag & ~PpcJitState::PREFIX_DIRTY); + } + + if ((js.prefixTFlag & PpcJitState::PREFIX_DIRTY) != 0) { + MOVI2R(SREG, js.prefixT); + STW(SREG, CTXREG, offsetof(MIPSState, vfpuCtrl[VFPU_CTRL_TPREFIX])); + js.prefixTFlag = (PpcJitState::PrefixState) (js.prefixTFlag & ~PpcJitState::PREFIX_DIRTY); + } + + if ((js.prefixDFlag & PpcJitState::PREFIX_DIRTY) != 0) { + MOVI2R(SREG, js.prefixD); + STW(SREG, CTXREG, offsetof(MIPSState, vfpuCtrl[VFPU_CTRL_DPREFIX])); + js.prefixDFlag = (PpcJitState::PrefixState) (js.prefixDFlag & ~PpcJitState::PREFIX_DIRTY); + } +} + +void Jit::ClearCache() { + blocks.Clear(); + ClearCodeSpace(0); + GenerateFixedCode(); +} + +void Jit::InvalidateCache() { + blocks.Clear(); +} + +void Jit::InvalidateCacheAt(u32 em_address, int length) { + blocks.InvalidateICache(em_address, length); +} + +Jit::Jit(MIPSState *mips) : blocks(mips, this), gpr(mips, &jo),fpr(mips),mips_(mips) +{ + blocks.Init(); + gpr.SetEmitter(this); + fpr.SetEmitter(this); +#ifdef __wiiu__ + // we only have a little less than 8MB RWX memory available in the HBL environment. + AllocCodeSpace(1024 * 1024 * 7); +#else + AllocCodeSpace(1024 * 1024 * 16); +#endif + GenerateFixedCode(); + + js.startDefaultPrefix = true; +} + +void Jit::RunLoopUntil(u64 globalticks) { + PROFILE_THIS_SCOPE("jit"); +#ifdef _XBOX + // force stack alinement + //_alloca(16*1024); +#endif + + // Run the compiled code + ((void (*)())enterCode)(); +} + + +// IDEA - could have a WriteDualExit that takes two destinations and two condition flags, +// and just have conditional that set PC "twice". 
This only works when we fall back to dispatcher +// though, as we need to have the SUBS flag set in the end. So with block linking in the mix, +// I don't think this gives us that much benefit. +void Jit::WriteExit(u32 destination, int exit_num) +{ + WriteDownCount(); + //If nobody has taken care of this yet (this can be removed when all branches are done) + JitBlock *b = js.curBlock; + b->exitAddress[exit_num] = destination; + b->exitPtrs[exit_num] = GetWritableCodePtr(); + + // Link opportunity! + int block = blocks.GetBlockNumberFromStartAddress(destination); + if (block >= 0 && jo.enableBlocklink) { + // It exists! Joy of joy! + B(blocks.GetBlock(block)->checkedEntry); + b->linkStatus[exit_num] = true; + } else { + MOVI2R(SREG, destination); + B((const void *)dispatcherPCInR0); + } +} + +void Jit::WriteExitDestInR(PPCReg Reg) +{ + //Break(); + MovToPC(Reg); + WriteDownCount(); + // TODO: shouldn't need an indirect branch here... + B((const void *)dispatcher); +} + +void Jit::WriteSyscallExit() +{ + WriteDownCount(); + B((const void *)dispatcherCheckCoreState); +} + +MIPSOpcode Jit::GetOriginalOp(MIPSOpcode op) { + JitBlockCache *bc = GetBlockCache(); + int block_num = bc->GetBlockNumberFromEmuHackOp(op, true); + if (block_num >= 0) { + return bc->GetOriginalFirstOp(block_num); + } else { + return op; + } +} + +} +#endif diff --git a/Core/MIPS/PPC/PpcJit.h b/Core/MIPS/PPC/PpcJit.h new file mode 100644 index 000000000000..26816b170efd --- /dev/null +++ b/Core/MIPS/PPC/PpcJit.h @@ -0,0 +1,337 @@ +// Copyright (c) 2012- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. + +#pragma once + +#include "Common/CommonTypes.h" +#include "Common/Serialize/Serializer.h" + +#include "Core/MIPS/JitCommon/JitBlockCache.h" +#include "Core/MIPS/JitCommon/JitState.h" +#include "Core/MIPS/JitCommon/JitCommon.h" +#include "Core/MIPS/PPC/PpcRegCache.h" +#include "Core/MIPS/PPC/PpcRegCacheFPU.h" + +#include "Core/MIPS/MIPS.h" + +#include + +namespace MIPSComp +{ + + struct PpcJitOptions + { + PpcJitOptions() + { + enableBlocklink = true; + downcountInRegister = true; + } + + bool enableBlocklink; + bool downcountInRegister; + }; + + struct PpcJitState + { + enum PrefixState + { + PREFIX_UNKNOWN = 0x00, + PREFIX_KNOWN = 0x01, + PREFIX_DIRTY = 0x10, + PREFIX_KNOWN_DIRTY = 0x11, + }; + + PpcJitState() + : prefixSFlag(PREFIX_UNKNOWN), + prefixTFlag(PREFIX_UNKNOWN), + prefixDFlag(PREFIX_UNKNOWN) {} + + u32 compilerPC; + u32 blockStart; + bool cancel; + bool inDelaySlot; + int downcountAmount; + bool compiling; // TODO: get rid of this in favor of using analysis results to determine end of block + JitBlock *curBlock; + + // VFPU prefix magic + bool startDefaultPrefix; + u32 prefixS; + u32 prefixT; + u32 prefixD; + PrefixState prefixSFlag; + PrefixState prefixTFlag; + PrefixState prefixDFlag; + void PrefixStart() { + if (startDefaultPrefix) { + EatPrefix(); + } else { + PrefixUnknown(); + } + } + void PrefixUnknown() { + prefixSFlag = PREFIX_UNKNOWN; + prefixTFlag = PREFIX_UNKNOWN; + prefixDFlag = PREFIX_UNKNOWN; + } + bool MayHavePrefix() const { + if (HasUnknownPrefix()) { + return true; + } else if (prefixS != 0xE4 || prefixT != 0xE4 || prefixD != 0) { + return true; + } else if (VfpuWriteMask() != 0) { + return true; + } + + return false; + } 
+ bool HasUnknownPrefix() const { + if (!(prefixSFlag & PREFIX_KNOWN) || !(prefixTFlag & PREFIX_KNOWN) || !(prefixDFlag & PREFIX_KNOWN)) { + return true; + } + return false; + } + bool HasNoPrefix() const { + return (prefixDFlag & PREFIX_KNOWN) && (prefixSFlag & PREFIX_KNOWN) && (prefixTFlag & PREFIX_KNOWN) && (prefixS == 0xE4 && prefixT == 0xE4 && prefixD == 0); + } + + void EatPrefix() { + if ((prefixSFlag & PREFIX_KNOWN) == 0 || prefixS != 0xE4) { + prefixSFlag = PREFIX_KNOWN_DIRTY; + prefixS = 0xE4; + } + if ((prefixTFlag & PREFIX_KNOWN) == 0 || prefixT != 0xE4) { + prefixTFlag = PREFIX_KNOWN_DIRTY; + prefixT = 0xE4; + } + if ((prefixDFlag & PREFIX_KNOWN) == 0 || prefixD != 0x0 || VfpuWriteMask() != 0) { + prefixDFlag = PREFIX_KNOWN_DIRTY; + prefixD = 0x0; + } + } + u8 VfpuWriteMask() const { + _assert_(prefixDFlag & PREFIX_KNOWN); + return (prefixD >> 8) & 0xF; + } + bool VfpuWriteMask(int i) const { + _assert_(prefixDFlag & PREFIX_KNOWN); + return (prefixD >> (8 + i)) & 1; + } + }; + + class Jit: public PpcGen::PPCXCodeBlock, public JitInterface, public MIPSFrontendInterface { + protected: + JitBlockCache blocks; + public: + Jit(MIPSState *mips); + void DoState(PointerWrap &p) { + auto s = p.Section("Jit", 1); + if (!s) + return; + + // Do nothing + } + static void DoDummyState(PointerWrap &p) { + auto s = p.Section("Jit", 1); + if (!s) + return; + + // Do nothing + } + + // Compiled ops should ignore delay slots + // the compiler will take care of them by itself + // OR NOT + void Comp_Generic(MIPSOpcode op); + + void EatInstruction(MIPSOpcode op); + void Comp_RunBlock(MIPSOpcode op); + void Comp_ReplacementFunc(MIPSOpcode op); + + // TODO: Eat VFPU prefixes here. 
+ void EatPrefix() { js.EatPrefix(); } + + // Ops + void Comp_ITypeMem(MIPSOpcode op); + void Comp_Cache(MIPSOpcode op); + + void Comp_RelBranch(MIPSOpcode op); + void Comp_RelBranchRI(MIPSOpcode op); + void Comp_FPUBranch(MIPSOpcode op); + void Comp_FPULS(MIPSOpcode op); + void Comp_FPUComp(MIPSOpcode op); + void Comp_Jump(MIPSOpcode op); + void Comp_JumpReg(MIPSOpcode op); + void Comp_Syscall(MIPSOpcode op); + void Comp_Break(MIPSOpcode op); + + void Comp_IType(MIPSOpcode op); + void Comp_RType2(MIPSOpcode op); + void Comp_RType3(MIPSOpcode op); + void Comp_ShiftType(MIPSOpcode op); + void Comp_Allegrex(MIPSOpcode op); + void Comp_Allegrex2(MIPSOpcode op); + void Comp_VBranch(MIPSOpcode op); + void Comp_MulDivType(MIPSOpcode op); + void Comp_Special3(MIPSOpcode op); + + void Comp_FPU3op(MIPSOpcode op); + void Comp_FPU2op(MIPSOpcode op); + void Comp_mxc1(MIPSOpcode op); + + void Comp_DoNothing(MIPSOpcode op); + + void Comp_SV(MIPSOpcode op); + void Comp_SVQ(MIPSOpcode op); + void Comp_VPFX(MIPSOpcode op); + void Comp_VVectorInit(MIPSOpcode op); + void Comp_VMatrixInit(MIPSOpcode op); + void Comp_VDot(MIPSOpcode op); + void Comp_VecDo3(MIPSOpcode op); + void Comp_VV2Op(MIPSOpcode op); + void Comp_Mftv(MIPSOpcode op); + void Comp_Vmfvc(MIPSOpcode op); + void Comp_Vmtvc(MIPSOpcode op); + void Comp_Vmmov(MIPSOpcode op); + void Comp_VScl(MIPSOpcode op); + void Comp_Vmmul(MIPSOpcode op); + void Comp_Vmscl(MIPSOpcode op); + void Comp_Vtfm(MIPSOpcode op); + void Comp_VHdp(MIPSOpcode op); + void Comp_VCrs(MIPSOpcode op); + void Comp_VDet(MIPSOpcode op); + void Comp_Vi2x(MIPSOpcode op); + void Comp_Vx2i(MIPSOpcode op); + void Comp_Vf2i(MIPSOpcode op); + void Comp_Vi2f(MIPSOpcode op); + void Comp_Vh2f(MIPSOpcode op); + void Comp_Vcst(MIPSOpcode op); + void Comp_Vhoriz(MIPSOpcode op); + void Comp_VRot(MIPSOpcode op); + void Comp_VIdt(MIPSOpcode op); + void Comp_Vcmp(MIPSOpcode op); + void Comp_Vcmov(MIPSOpcode op); + void Comp_Viim(MIPSOpcode op); + void Comp_Vfim(MIPSOpcode 
op); + void Comp_VCrossQuat(MIPSOpcode op); + void Comp_Vsgn(MIPSOpcode op); + void Comp_Vocp(MIPSOpcode op); + void Comp_ColorConv(MIPSOpcode op); + void Comp_Vbfy(MIPSOpcode op); + + int Replace_fabsf(); + + // Utility compilation functions + void BranchFPFlag(MIPSOpcode op, PpcGen::FixupBranchType cc, bool likely); + void BranchVFPUFlag(MIPSOpcode op, PpcGen::FixupBranchType cc, bool likely); + void BranchRSZeroComp(MIPSOpcode op, PpcGen::FixupBranchType cc, bool andLink, bool likely); + void BranchRSRTComp(MIPSOpcode op, PpcGen::FixupBranchType cc, bool likely); + + void SetRegToEffectiveAddress(PpcGen::PPCReg r, int rs, s16 offset); + + // Utilities to reduce duplicated code + void CompImmLogic(int rs, int rt, u32 uimm, void (PPCXEmitter::*arith)(PPCReg Rd, PPCReg Ra, unsigned short imm), u32 (*eval)(u32 a, u32 b)); + void CompType3(int rd, int rs, int rt, void (PPCXEmitter::*arithOp2)(PPCReg Rd, PPCReg Ra, PPCReg Rb), u32 (*eval)(u32 a, u32 b), bool isSub = false); + void FPUComp(int fs, int ft, PpcGen::FixupBranchType cond, bool unorderer = false, int bf = 0); + + void ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz); + void ApplyPrefixD(const u8 *vregs, VectorSize sz); + void GetVectorRegsPrefixS(u8 *regs, VectorSize sz, int vectorReg) { + _assert_(js.prefixSFlag & PpcJitState::PREFIX_KNOWN); + GetVectorRegs(regs, sz, vectorReg); + ApplyPrefixST(regs, js.prefixS, sz); + } + void GetVectorRegsPrefixT(u8 *regs, VectorSize sz, int vectorReg) { + _assert_(js.prefixTFlag & PpcJitState::PREFIX_KNOWN); + GetVectorRegs(regs, sz, vectorReg); + ApplyPrefixST(regs, js.prefixT, sz); + } + void GetVectorRegsPrefixD(u8 *regs, VectorSize sz, int vectorReg); + + // flush regs + void FlushAll(); + void FlushPrefixV(); + + void WriteDownCount(int offset = 0); + void MovFromPC(PpcGen::PPCReg r); + void MovToPC(PpcGen::PPCReg r); + + void SaveDowncount(PpcGen::PPCReg r); + void RestoreDowncount(PpcGen::PPCReg r); + + void WriteExit(u32 destination, int exit_num); + void 
WriteExitDestInR(PPCReg Reg); + void WriteSyscallExit(); + + void ClearCache(); + void InvalidateCache(); + void InvalidateCacheAt(u32 em_address, int length = 4); + void UpdateFCR31() override {} + + const u8 *GetDispatcher() const override { return dispatcher; } + + void LinkBlock(u8 *exitPoint, const u8 *checkedEntry) override; + void UnlinkBlock(u8 *checkedEntry, u32 originalAddress) override; + + void RunLoopUntil(u64 globalticks); + void GenerateFixedCode(); + + void DumpJit(); + + void CompileDelaySlot(int flags); + void Compile(u32 em_address); // Compiles a block at current MIPS PC + const u8 *DoJit(u32 em_address, JitBlock *b); + + bool DescribeCodePtr(const u8 *ptr, std::string &name); + + const u8 *GetCrashHandler() const override { return crashHandler; } + bool CodeInRange(const u8 *ptr) const override { return IsInSpace(ptr); } + + PpcJitOptions jo; + PpcJitState js; + + PpcRegCache gpr; + PpcRegCacheFPU fpr; + + MIPSState *mips_; + + JitBlockCache *GetBlockCache() { return &blocks; } + JitBlockCacheDebugInterface *GetBlockCacheDebugInterface() override { return &blocks; } + + MIPSOpcode GetOriginalOp(MIPSOpcode op) override; + + std::vector SaveAndClearEmuHackOps() override { return blocks.SaveAndClearEmuHackOps(); } + void RestoreSavedEmuHackOps(std::vector saved) override { blocks.RestoreSavedEmuHackOps(saved); } + + private: + // Code pointers + const u8 *enterCode; + + const u8 *outerLoop; + const u8 *outerLoopPCInR0; + const u8 *dispatcherCheckCoreState; + const u8 *dispatcherPCInR0; + const u8 *dispatcher; + const u8 *dispatcherNoCheck; + + const u8 *breakpointBailout; + const u8 *crashHandler; + + }; + +} // namespace MIPSComp + diff --git a/Core/MIPS/PPC/PpcRegCache.cpp b/Core/MIPS/PPC/PpcRegCache.cpp new file mode 100644 index 000000000000..07e0f36c2a66 --- /dev/null +++ b/Core/MIPS/PPC/PpcRegCache.cpp @@ -0,0 +1,318 @@ +// Copyright (c) 2012- PPSSPP Project. 
+ +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. + +#include "ppsspp_config.h" +#if PPSSPP_ARCH(POWERPC) + +#include "Common/ppcEmitter.h" +#include "PpcRegCache.h" +#include "PpcJit.h" + +using namespace PpcGen; +using namespace PpcJitConstants; + +PpcRegCache::PpcRegCache(MIPSState *mips, MIPSComp::PpcJitOptions *options) : mips_(mips), options_(options) { +} + +void PpcRegCache::Init(PPCXEmitter *emitter) { + emit_ = emitter; +} + +void PpcRegCache::Start(MIPSAnalyst::AnalysisResults &stats) { + for (int i = 0; i < NUM_PPCREG; i++) { + ar[i].mipsReg = -1; + ar[i].isDirty = false; + } + for (int i = 0; i < NUM_MIPSREG; i++) { + mr[i].loc = ML_MEM; + mr[i].reg = INVALID_REG; + mr[i].imm = -1; + mr[i].spillLock = false; + } +} + +const PPCReg *PpcRegCache::GetMIPSAllocationOrder(int &count) const { + // Note that R0 is reserved as scratch for now. + // R1 could be used as it's only used for scratch outside "regalloc space" now. + // R12 is also potentially usable. + // R4-R7 are registers we could use for static allocation or downcount. + // R8 is used to preserve flags in nasty branches. + // R9 and upwards are reserved for jit basics. 
+ if (options_->downcountInRegister) { + static const PPCReg allocationOrder[] = { + /*R14, R15, R16, R17, R18, R19,*/ + R20, R21, R22, R23, R24, R25, + R26, R27, R28, R29, R30, R31, + }; + count = sizeof(allocationOrder) / sizeof(const int); + return allocationOrder; + } else { + static const PPCReg allocationOrder2[] = { + /*R14, R15, R16, R17, R18, R19,*/ + R20, R21, R22, R23, R24, R25, + R26, R27, R28, R29, R30, R31, + }; + count = sizeof(allocationOrder2) / sizeof(const int); + return allocationOrder2; + } +} + +void PpcRegCache::FlushBeforeCall() { + // R4-R11 are preserved. Others need flushing. + /* + FlushPpcReg(R2); + FlushPpcReg(R3); + FlushPpcReg(R12); + */ +} + +// TODO: Somewhat smarter spilling - currently simply spills the first available, should do +// round robin or FIFO or something. +PPCReg PpcRegCache::MapReg(MIPSReg mipsReg, int mapFlags) { + // Let's see if it's already mapped. If so we just need to update the dirty flag. + // We don't need to check for ML_NOINIT because we assume that anyone who maps + // with that flag immediately writes a "known" value to the register. + if (mr[mipsReg].loc == ML_PPCREG) { + if (ar[mr[mipsReg].reg].mipsReg != mipsReg) { + ERROR_LOG(JIT, "Register mapping out of sync! %i", mipsReg); + } + if (mapFlags & MAP_DIRTY) { + ar[mr[mipsReg].reg].isDirty = true; + } + return (PPCReg)mr[mipsReg].reg; + } + + // Okay, not mapped, so we need to allocate an ARM register. + + int allocCount; + const PPCReg *allocOrder = GetMIPSAllocationOrder(allocCount); + +allocate: + for (int i = 0; i < allocCount; i++) { + int reg = allocOrder[i]; + + if (ar[reg].mipsReg == -1) { + // That means it's free. Grab it, and load the value into it (if requested). + ar[reg].isDirty = (mapFlags & MAP_DIRTY) ? 
true : false; + if ((mapFlags & MAP_NOINIT) != MAP_NOINIT) { + if (mr[mipsReg].loc == ML_MEM) { + if (mipsReg != 0) { + emit_->LWZ((PPCReg)reg, CTXREG, GetMipsRegOffset(mipsReg)); + } else { + // If we get a request to load the zero register, at least we won't spend + // time on a memory access... + emit_->MOVI2R((PPCReg)reg, 0); + } + } else if (mr[mipsReg].loc == ML_IMM) { + emit_->MOVI2R((PPCReg)reg, mr[mipsReg].imm); + ar[reg].isDirty = true; // IMM is always dirty. + } + } + ar[reg].mipsReg = mipsReg; + mr[mipsReg].loc = ML_PPCREG; + mr[mipsReg].reg = (PPCReg)reg; + return (PPCReg)reg; + } + } + + // Still nothing. Let's spill a reg and goto 10. + // TODO: Use age or something to choose which register to spill? + // TODO: Spill dirty regs first? or opposite? + int bestToSpill = -1; + for (int i = 0; i < allocCount; i++) { + int reg = allocOrder[i]; + if (ar[reg].mipsReg != -1 && mr[ar[reg].mipsReg].spillLock) + continue; + bestToSpill = reg; + break; + } + + if (bestToSpill != -1) { + // ERROR_LOG(JIT, "Out of registers at PC %08x - spills register %i.", mips_->pc, bestToSpill); + FlushPpcReg((PPCReg)bestToSpill); + goto allocate; + } + + // Uh oh, we have all them spilllocked.... + ERROR_LOG(JIT, "Out of spillable registers at PC %08x!!!", mips_->pc); + return INVALID_REG; +} + +void PpcRegCache::MapInIn(MIPSReg rd, MIPSReg rs) { + SpillLock(rd, rs); + MapReg(rd); + MapReg(rs); + ReleaseSpillLocks(); +} + +void PpcRegCache::MapDirtyIn(MIPSReg rd, MIPSReg rs, bool avoidLoad) { + SpillLock(rd, rs); + bool load = !avoidLoad || rd == rs; + MapReg(rd, load ? MAP_DIRTY : MAP_NOINIT); + MapReg(rs); + ReleaseSpillLocks(); +} + +void PpcRegCache::MapDirtyInIn(MIPSReg rd, MIPSReg rs, MIPSReg rt, bool avoidLoad) { + SpillLock(rd, rs, rt); + bool load = !avoidLoad || (rd == rs || rd == rt); + MapReg(rd, load ? 
MAP_DIRTY : MAP_NOINIT); + MapReg(rt); + MapReg(rs); + ReleaseSpillLocks(); +} + +void PpcRegCache::MapDirtyDirtyInIn(MIPSReg rd1, MIPSReg rd2, MIPSReg rs, MIPSReg rt, bool avoidLoad) { + SpillLock(rd1, rd2, rs, rt); + bool load1 = !avoidLoad || (rd1 == rs || rd1 == rt); + bool load2 = !avoidLoad || (rd2 == rs || rd2 == rt); + MapReg(rd1, load1 ? MAP_DIRTY : MAP_NOINIT); + MapReg(rd2, load2 ? MAP_DIRTY : MAP_NOINIT); + MapReg(rt); + MapReg(rs); + ReleaseSpillLocks(); +} + +void PpcRegCache::FlushPpcReg(PPCReg r) { + if (ar[r].mipsReg == -1) { + // Nothing to do, reg not mapped. + return; + } + if (ar[r].mipsReg != -1) { + if (ar[r].isDirty && mr[ar[r].mipsReg].loc == ML_PPCREG) + emit_->STW(r, CTXREG, GetMipsRegOffset(ar[r].mipsReg)); + // IMMs won't be in an ARM reg. + mr[ar[r].mipsReg].loc = ML_MEM; + mr[ar[r].mipsReg].reg = INVALID_REG; + mr[ar[r].mipsReg].imm = 0; + } else { + ERROR_LOG(JIT, "Dirty but no mipsreg?"); + } + ar[r].isDirty = false; + ar[r].mipsReg = -1; +} + +void PpcRegCache::FlushR(MIPSReg r) { + switch (mr[r].loc) { + case ML_IMM: + // IMM is always "dirty". + emit_->MOVI2R(SREG, mr[r].imm); + emit_->STW(SREG, CTXREG, GetMipsRegOffset(r)); + break; + + case ML_PPCREG: + if (mr[r].reg == INVALID_REG) { + ERROR_LOG(JIT, "FlushMipsReg: MipsReg had bad PpcReg"); + } + if (ar[mr[r].reg].isDirty) { + emit_->STW((PPCReg)mr[r].reg, CTXREG, GetMipsRegOffset(r)); + ar[mr[r].reg].isDirty = false; + } + ar[mr[r].reg].mipsReg = -1; + break; + + case ML_MEM: + // Already there, nothing to do. 
+ break; + + default: + //BAD + break; + } + mr[r].loc = ML_MEM; + mr[r].reg = INVALID_REG; + mr[r].imm = 0; +} + +void PpcRegCache::FlushAll() { + for (int i = 0; i < NUM_MIPSREG; i++) { + FlushR(i); + } + // Sanity check + for (int i = 0; i < NUM_PPCREG; i++) { + if (ar[i].mipsReg != -1) { + ERROR_LOG(JIT, "Flush fail: ar[%i].mipsReg=%i", i, ar[i].mipsReg); + } + } +} + +void PpcRegCache::SetImm(MIPSReg r, u32 immVal) { + if (r == 0) + ERROR_LOG(JIT, "Trying to set immediate %08x to r0", immVal); + + // Zap existing value if cached in a reg + if (mr[r].loc == ML_PPCREG) { + ar[mr[r].reg].mipsReg = -1; + ar[mr[r].reg].isDirty = false; + } + mr[r].loc = ML_IMM; + mr[r].imm = immVal; + mr[r].reg = INVALID_REG; +} + +bool PpcRegCache::IsImm(MIPSReg r) const { + if (r == 0) return true; + return mr[r].loc == ML_IMM; +} + +u32 PpcRegCache::GetImm(MIPSReg r) const { + if (r == 0) return 0; + if (mr[r].loc != ML_IMM) { + ERROR_LOG(JIT, "Trying to get imm from non-imm register %i", r); + } + return mr[r].imm; +} + +int PpcRegCache::GetMipsRegOffset(MIPSReg r) { + if (r < 32) + return r * 4; + switch (r) { + case MIPSREG_HI: + return offsetof(MIPSState, hi); + case MIPSREG_LO: + return offsetof(MIPSState, lo); + } + ERROR_LOG(JIT, "bad mips register %i", r); + return 0; // or what? +} + +void PpcRegCache::SpillLock(MIPSReg r1, MIPSReg r2, MIPSReg r3, MIPSReg r4) { + mr[r1].spillLock = true; + if (r2 != -1) mr[r2].spillLock = true; + if (r3 != -1) mr[r3].spillLock = true; + if (r4 != -1) mr[r4].spillLock = true; +} + +void PpcRegCache::ReleaseSpillLocks() { + for (int i = 0; i < NUM_MIPSREG; i++) { + mr[i].spillLock = false; + } +} + +void PpcRegCache::ReleaseSpillLock(MIPSReg reg) { + mr[reg].spillLock = false; +} + +PPCReg PpcRegCache::R(int mipsReg) { + if (mr[mipsReg].loc == ML_PPCREG) { + return (PPCReg)mr[mipsReg].reg; + } else { + ERROR_LOG(JIT, "Reg %i not in ppc reg. 
compilerPC = %08x", mipsReg, compilerPC_); + return INVALID_REG; // BAAAD + } +} +#endif diff --git a/Core/MIPS/PPC/PpcRegCache.h b/Core/MIPS/PPC/PpcRegCache.h new file mode 100644 index 000000000000..8259444348ef --- /dev/null +++ b/Core/MIPS/PPC/PpcRegCache.h @@ -0,0 +1,161 @@ +// Copyright (c) 2012- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. + + +/** +PPC reg cache based on arm version +**/ + +#pragma once + +#include "../MIPS.h" +#include "../MIPSAnalyst.h" +#include "ppcEmitter.h" + +using namespace PpcGen; + +// R2 to R8: mapped MIPS regs +// R9 = code pointers +// R10 = MIPS context +// R11 = base pointer + + +// R18 to R31: mapped MIPS regs +// R14 = MIPS context +// R15 = downcount register +// R16 = code pointer +// R17 = base pointer + +#if 1 +#define CTXREG (R14) +#define DCNTREG (R15) +#define CODEREG (R16) +#define BASEREG (R17) +#else +#define CTXREG (R6) +#define DCNTREG (R7) +#define CODEREG (R8) +#define BASEREG (R9) +#endif + + +// Safe to use this as scratch regs ? 
+#define SREG (R5) +#define FLAGREG (R18) + +namespace PpcJitConstants { + +// Special MIPS registers: +enum { + MIPSREG_HI = 32, + MIPSREG_LO = 33, + TOTAL_MAPPABLE_MIPSREGS = 34, +}; + +enum RegMIPSLoc { + ML_IMM, + ML_PPCREG, + ML_MEM, +}; + +#undef MAP_DIRTY +#undef MAP_NOINIT +// Initing is the default so the flag is reversed. +enum { + MAP_DIRTY = 1, + MAP_NOINIT = 2 | MAP_DIRTY, +}; + +} + +typedef int MIPSReg; + +struct RegPPC { + int mipsReg; // if -1, no mipsreg attached. + bool isDirty; // Should the register be written back? +}; + + +struct RegMIPS { + // Where is this MIPS register? + PpcJitConstants::RegMIPSLoc loc; + // Data (only one of these is used, depending on loc. Could make a union). + u32 imm; + PPCReg reg; // reg index + bool spillLock; // if true, this register cannot be spilled. + // If loc == ML_MEM, it's back in its location in the CPU context struct. +}; + +namespace MIPSComp { + struct PpcJitOptions; +} + +class PpcRegCache +{ +public: + PpcRegCache(MIPSState *mips, MIPSComp::PpcJitOptions *options); + ~PpcRegCache() {} + + void Init(PPCXEmitter *emitter); + void Start(MIPSAnalyst::AnalysisResults &stats); + + // Protect the arm register containing a MIPS register from spilling, to ensure that + // it's being kept allocated. + void SpillLock(MIPSReg reg, MIPSReg reg2 = -1, MIPSReg reg3 = -1, MIPSReg reg4 = -1); + void ReleaseSpillLock(MIPSReg reg); + void ReleaseSpillLocks(); + + void SetImm(MIPSReg reg, u32 immVal); + bool IsImm(MIPSReg reg) const; + u32 GetImm(MIPSReg reg) const; + + // Returns an ARM register containing the requested MIPS register. 
+ PPCReg MapReg(MIPSReg reg, int mapFlags = 0); + void MapInIn(MIPSReg rd, MIPSReg rs); + void MapDirtyIn(MIPSReg rd, MIPSReg rs, bool avoidLoad = true); + void MapDirtyInIn(MIPSReg rd, MIPSReg rs, MIPSReg rt, bool avoidLoad = true); + void MapDirtyDirtyInIn(MIPSReg rd1, MIPSReg rd2, MIPSReg rs, MIPSReg rt, bool avoidLoad = true); + void FlushPpcReg(PPCReg r); + void FlushR(MIPSReg r); + void FlushBeforeCall(); + void FlushAll(); + + PPCReg R(int preg); // Returns a cached register + + void SetEmitter(PPCXEmitter *emitter) { emit_ = emitter; } + + // For better log output only. + void SetCompilerPC(u32 compilerPC) { compilerPC_ = compilerPC; } + + int GetMipsRegOffset(MIPSReg r); + +private: + const PPCReg *GetMIPSAllocationOrder(int &count) const; + + MIPSState *mips_; + MIPSComp::PpcJitOptions *options_; + PPCXEmitter *emit_; + u32 compilerPC_; + + enum { + NUM_PPCREG = 32, + NUM_MIPSREG = PpcJitConstants::TOTAL_MAPPABLE_MIPSREGS, + }; + + RegPPC ar[NUM_MIPSREG]; + RegMIPS mr[NUM_MIPSREG]; +}; diff --git a/Core/MIPS/PPC/PpcRegCacheFPU.cpp b/Core/MIPS/PPC/PpcRegCacheFPU.cpp new file mode 100644 index 000000000000..325c536f47b7 --- /dev/null +++ b/Core/MIPS/PPC/PpcRegCacheFPU.cpp @@ -0,0 +1,393 @@ +// Copyright (c) 2012- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. 
+ +#include "ppsspp_config.h" +#if PPSSPP_ARCH(POWERPC) + +#include "Common/ppcEmitter.h" +#include "Common/CPUDetect.h" +#include "Core/MIPS/PPC/PpcRegCacheFPU.h" +#include "Core/MIPS/MIPSDebugInterface.h" + + +using namespace PpcGen; +using namespace PpcJitConstants; + + +PpcRegCacheFPU::PpcRegCacheFPU(MIPSState *mips) : mips_(mips), vr(mr + 32) { +} + +void PpcRegCacheFPU::Init(PPCXEmitter *emitter) { + emit_ = emitter; +} + +void PpcRegCacheFPU::Start(MIPSAnalyst::AnalysisResults &stats) { + for (int i = 0; i < NUM_PPCFPUREG; i++) { + ar[i].mipsReg = -1; + ar[i].isDirty = false; + } + for (int i = 0; i < NUM_MIPSFPUREG; i++) { + mr[i].loc = ML_MEM; + mr[i].reg = INVALID_REG; + mr[i].spillLock = false; + mr[i].tempLock = false; + } +} + +static const PPCReg *GetMIPSAllocationOrder(int &count) { + // We reserve S0-S1 as scratch. Can afford two registers. Maybe even four, which could simplify some things. + static const PPCReg allocationOrder[] = { + FPR14, FPR15, FPR16, FPR17, + FPR18, FPR19, FPR20, FPR21, + FPR22, FPR23, FPR24, FPR25, + FPR26, FPR27, FPR28, FPR29, + FPR30, FPR31 + }; + + count = sizeof(allocationOrder) / sizeof(const int); + return allocationOrder; +} + +PPCReg PpcRegCacheFPU::MapReg(MIPSReg mipsReg, int mapFlags) { + // Let's see if it's already mapped. If so we just need to update the dirty flag. + // We don't need to check for ML_NOINIT because we assume that anyone who maps + // with that flag immediately writes a "known" value to the register. + if (mr[mipsReg].loc == ML_PPCREG) { + if (ar[mr[mipsReg].reg].mipsReg != mipsReg) { + ERROR_LOG(HLE, "Register mapping out of sync! %i", mipsReg); + } + if (mapFlags & MAP_DIRTY) { + ar[mr[mipsReg].reg].isDirty = true; + } + //INFO_LOG(HLE, "Already mapped %i to %i", mipsReg, mr[mipsReg].reg); + return (PPCReg)(mr[mipsReg].reg + FPR0); + } + + // Okay, not mapped, so we need to allocate an PPC register. 
+ + int allocCount; + const PPCReg *allocOrder = GetMIPSAllocationOrder(allocCount); + +allocate: + for (int i = 0; i < allocCount; i++) { + int reg = allocOrder[i] - FPR0; + + if (ar[reg].mipsReg == -1) { + // That means it's free. Grab it, and load the value into it (if requested). + ar[reg].isDirty = (mapFlags & MAP_DIRTY) ? true : false; + if ((mapFlags & MAP_NOINIT) != MAP_NOINIT) { + if (mr[mipsReg].loc == ML_MEM && mipsReg < TEMP0) { + emit_->LFS((PPCReg)(reg + FPR0), CTXREG, GetMipsRegOffset(mipsReg)); + } + } + ar[reg].mipsReg = mipsReg; + mr[mipsReg].loc = ML_PPCREG; + mr[mipsReg].reg = reg; + //INFO_LOG(HLE, "Mapped %i to %i", mipsReg, mr[mipsReg].reg); + return (PPCReg)(reg + FPR0); + } + } + + + // Still nothing. Let's spill a reg and goto 10. + // TODO: Use age or something to choose which register to spill? + // TODO: Spill dirty regs first? or opposite? + int bestToSpill = -1; + for (int i = 0; i < allocCount; i++) { + int reg = allocOrder[i] - FPR0; + if (ar[reg].mipsReg != -1 && (mr[ar[reg].mipsReg].spillLock || mr[ar[reg].mipsReg].tempLock)) + continue; + bestToSpill = reg; + break; + } + + if (bestToSpill != -1) { + FlushPpcReg((PPCReg)(FPR0 + bestToSpill)); + goto allocate; + } + + // Uh oh, we have all them spilllocked.... + ERROR_LOG(JIT, "Out of spillable registers at PC %08x!!!", mips_->pc); + return INVALID_REG; +} + +void PpcRegCacheFPU::MapInIn(MIPSReg rd, MIPSReg rs) { + SpillLock(rd, rs); + MapReg(rd); + MapReg(rs); + ReleaseSpillLock(rd); + ReleaseSpillLock(rs); +} + +void PpcRegCacheFPU::MapDirtyIn(MIPSReg rd, MIPSReg rs, bool avoidLoad) { + SpillLock(rd, rs); + bool overlap = avoidLoad && rd == rs; + MapReg(rd, MAP_DIRTY | (overlap ? 0 : MAP_NOINIT)); + MapReg(rs); + ReleaseSpillLock(rd); + ReleaseSpillLock(rs); +} + +void PpcRegCacheFPU::MapDirtyInIn(MIPSReg rd, MIPSReg rs, MIPSReg rt, bool avoidLoad) { + SpillLock(rd, rs, rt); + bool overlap = avoidLoad && (rd == rs || rd == rt); + MapReg(rd, MAP_DIRTY | (overlap ? 
0 : MAP_NOINIT)); + MapReg(rt); + MapReg(rs); + ReleaseSpillLock(rd); + ReleaseSpillLock(rs); + ReleaseSpillLock(rt); +} + +void PpcRegCacheFPU::SpillLockV(const u8 *v, VectorSize sz) { + for (int i = 0; i < GetNumVectorElements(sz); i++) { + vr[v[i]].spillLock = true; + } +} + +void PpcRegCacheFPU::SpillLockV(int vec, VectorSize sz) { + u8 v[4]; + GetVectorRegs(v, sz, vec); + SpillLockV(v, sz); +} + +void PpcRegCacheFPU::MapRegV(int vreg, int flags) { + MapReg(vreg + 32, flags); +} + +void PpcRegCacheFPU::LoadToRegV(PPCReg ppcReg, int vreg) { + if (vr[vreg].loc == ML_PPCREG) { + emit_->FMR(ppcReg, (PPCReg)(FPR0 + vr[vreg].reg)); + } else { + MapRegV(vreg); + emit_->FMR(ppcReg, V(vreg)); + } +} + +void PpcRegCacheFPU::MapRegsAndSpillLockV(int vec, VectorSize sz, int flags) { + u8 v[4]; + GetVectorRegs(v, sz, vec); + SpillLockV(v, sz); + for (int i = 0; i < GetNumVectorElements(sz); i++) { + MapRegV(v[i], flags); + } +} + +void PpcRegCacheFPU::MapRegsAndSpillLockV(const u8 *v, VectorSize sz, int flags) { + SpillLockV(v, sz); + for (int i = 0; i < GetNumVectorElements(sz); i++) { + MapRegV(v[i], flags); + } +} + +void PpcRegCacheFPU::MapInInV(int vs, int vt) { + SpillLockV(vs); + SpillLockV(vt); + MapRegV(vs); + MapRegV(vt); + ReleaseSpillLockV(vs); + ReleaseSpillLockV(vt); +} + +void PpcRegCacheFPU::MapDirtyInV(int vd, int vs, bool avoidLoad) { + bool overlap = avoidLoad && (vd == vs); + SpillLockV(vd); + SpillLockV(vs); + MapRegV(vd, MAP_DIRTY | (overlap ? 0 : MAP_NOINIT)); + MapRegV(vs); + ReleaseSpillLockV(vd); + ReleaseSpillLockV(vs); +} + +void PpcRegCacheFPU::MapDirtyInInV(int vd, int vs, int vt, bool avoidLoad) { + bool overlap = avoidLoad && ((vd == vs) || (vd == vt)); + SpillLockV(vd); + SpillLockV(vs); + SpillLockV(vt); + MapRegV(vd, MAP_DIRTY | (overlap ? 
0 : MAP_NOINIT)); + MapRegV(vs); + MapRegV(vt); + ReleaseSpillLockV(vd); + ReleaseSpillLockV(vs); + ReleaseSpillLockV(vt); +} + +void PpcRegCacheFPU::FlushPpcReg(PPCReg r) { + int reg = r - FPR0; + if (ar[reg].mipsReg == -1) { + // Nothing to do, reg not mapped. + return; + } + if (ar[reg].mipsReg != -1) { + if (ar[reg].isDirty && mr[ar[reg].mipsReg].loc == ML_PPCREG) + { + //INFO_LOG(HLE, "Flushing PPC reg %i", reg); + emit_->SFS(r, CTXREG, GetMipsRegOffset(ar[reg].mipsReg)); + } + // IMMs won't be in an PPC reg. + mr[ar[reg].mipsReg].loc = ML_MEM; + mr[ar[reg].mipsReg].reg = INVALID_REG; + } else { + ERROR_LOG(HLE, "Dirty but no mipsreg?"); + } + ar[reg].isDirty = false; + ar[reg].mipsReg = -1; +} + +void PpcRegCacheFPU::FlushR(MIPSReg r) { + switch (mr[r].loc) { + case ML_IMM: + // IMM is always "dirty". + // IMM is not allowed for FP (yet). + ERROR_LOG(HLE, "Imm in FP register?"); + break; + + case ML_PPCREG: + if (mr[r].reg == (int)INVALID_REG) { + ERROR_LOG(HLE, "FlushR: MipsReg had bad PpcReg"); + } + if (ar[mr[r].reg].isDirty) { + //INFO_LOG(HLE, "Flushing dirty reg %i", mr[r].reg); + emit_->SFS((PPCReg)(mr[r].reg + FPR0), CTXREG, GetMipsRegOffset(r)); + ar[mr[r].reg].isDirty = false; + } + ar[mr[r].reg].mipsReg = -1; + break; + + case ML_MEM: + // Already there, nothing to do. + break; + + default: + //BAD + break; + } + mr[r].loc = ML_MEM; + mr[r].reg = (int)INVALID_REG; +} + +void PpcRegCacheFPU::DiscardR(MIPSReg r) { + switch (mr[r].loc) { + case ML_IMM: + // IMM is always "dirty". + // IMM is not allowed for FP (yet). + ERROR_LOG(HLE, "Imm in FP register?"); + break; + + case ML_PPCREG: + if (mr[r].reg == (int)INVALID_REG) { + ERROR_LOG(HLE, "DiscardR: MipsReg had bad PpcReg"); + } + // Note that we DO NOT write it back here. That's the whole point of Discard. + ar[mr[r].reg].isDirty = false; + ar[mr[r].reg].mipsReg = -1; + break; + + case ML_MEM: + // Already there, nothing to do. 
+ break; + + default: + //BAD + break; + } + mr[r].loc = ML_MEM; + mr[r].reg = (int)INVALID_REG; + mr[r].tempLock = false; + mr[r].spillLock = false; +} + + +bool PpcRegCacheFPU::IsTempX(PPCReg r) const { + return ar[r - FPR0].mipsReg >= TEMP0; +} + +int PpcRegCacheFPU::GetTempR() { + for (int r = TEMP0; r < TEMP0 + NUM_TEMPS; ++r) { + if (mr[r].loc == ML_MEM && !mr[r].tempLock) { + mr[r].tempLock = true; + return r; + } + } + + ERROR_LOG(CPU, "Out of temp regs! Might need to DiscardR() some"); + _assert_msg_(0, "Regcache ran out of temp regs, might need to DiscardR() some."); + return -1; +} + + +void PpcRegCacheFPU::FlushAll() { + // Discard temps! + for (int i = TEMP0; i < TEMP0 + NUM_TEMPS; i++) { + DiscardR(i); + } + for (int i = 0; i < NUM_MIPSFPUREG; i++) { + FlushR(i); + } + // Sanity check + for (int i = 0; i < NUM_PPCFPUREG; i++) { + if (ar[i].mipsReg != -1) { + ERROR_LOG(JIT, "Flush fail: ar[%i].mipsReg=%i", i, ar[i].mipsReg); + } + } +} + +int PpcRegCacheFPU::GetMipsRegOffset(MIPSReg r) { + // These are offsets within the MIPSState structure. First there are the GPRS, then FPRS, then the "VFPURs", then the VFPU ctrls. + if (r < 0 || r > 32 + 128 + NUM_TEMPS) { + ERROR_LOG(JIT, "bad mips register %i, out of range", r); + return 0; // or what? + } + + if (r < 32 || r > 32 + 128) { + return (32 + r) << 2; + } else { + // r is between 32 and 128 + 32 + return (32 + 32 + voffset[r - 32]) << 2; + } +} + +void PpcRegCacheFPU::SpillLock(MIPSReg r1, MIPSReg r2, MIPSReg r3, MIPSReg r4) { + mr[r1].spillLock = true; + if (r2 != -1) mr[r2].spillLock = true; + if (r3 != -1) mr[r3].spillLock = true; + if (r4 != -1) mr[r4].spillLock = true; +} + +// This is actually pretty slow with all the 160 regs... 
+void PpcRegCacheFPU::ReleaseSpillLocksAndDiscardTemps() { + for (int i = 0; i < NUM_MIPSFPUREG; i++) + mr[i].spillLock = false; + for (int i = TEMP0; i < TEMP0 + NUM_TEMPS; ++i) + DiscardR(i); +} + +PPCReg PpcRegCacheFPU::R(int mipsReg) { + if (mr[mipsReg].loc == ML_PPCREG) { + return (PPCReg)(mr[mipsReg].reg + FPR0); + } else { + if (mipsReg < 32) { + ERROR_LOG(JIT, "FReg %i not in PPC reg. compilerPC = %08x : %s", mipsReg, compilerPC_, currentDebugMIPS->disasm(compilerPC_, 0)); + } else if (mipsReg < 32 + 128) { + ERROR_LOG(JIT, "VReg %i not in PPC reg. compilerPC = %08x : %s", mipsReg - 32, compilerPC_, currentDebugMIPS->disasm(compilerPC_, 0)); + } else { + ERROR_LOG(JIT, "Tempreg %i not in PPC reg. compilerPC = %08x : %s", mipsReg - 128 - 32, compilerPC_, currentDebugMIPS->disasm(compilerPC_, 0)); + } + return INVALID_REG; // BAAAD + } +} +#endif diff --git a/Core/MIPS/PPC/PpcRegCacheFPU.h b/Core/MIPS/PPC/PpcRegCacheFPU.h new file mode 100644 index 000000000000..c9535bf8ba8e --- /dev/null +++ b/Core/MIPS/PPC/PpcRegCacheFPU.h @@ -0,0 +1,143 @@ +// Copyright (c) 2012- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. 
+ +#pragma once + +#pragma once + +#include "Core/MIPS/MIPS.h" +#include "Core/MIPS/MIPSAnalyst.h" +#include "Common/ppcEmitter.h" +#include "Core/MIPS/PPC/PpcRegCache.h" +#include "Core/MIPS/MIPSVFPUUtils.h" + +using namespace PpcGen; + +namespace PpcJitConstants { + +enum { + NUM_TEMPS = 16, + TEMP0 = 32 + 128, + TOTAL_MAPPABLE_MIPSFPUREGS = 32 + 128 + NUM_TEMPS, +}; + +} + +struct FPURegPPC { + int mipsReg; // if -1, no mipsreg attached. + bool isDirty; // Should the register be written back? +}; + +struct FPURegMIPS { + // Where is this MIPS register? + PpcJitConstants::RegMIPSLoc loc; + // Data (only one of these is used, depending on loc. Could make a union). + int reg; + bool spillLock; // if true, this register cannot be spilled. + bool tempLock; + // If loc == ML_MEM, it's back in its location in the CPU context struct. +}; + +class PpcRegCacheFPU +{ +public: + PpcRegCacheFPU(MIPSState *mips); + ~PpcRegCacheFPU() {} + + void Init(PPCXEmitter *emitter); + void Start(MIPSAnalyst::AnalysisResults &stats); + + // Protect the ppc register containing a MIPS register from spilling, to ensure that + // it's being kept allocated. + void SpillLock(MIPSReg reg, MIPSReg reg2 = -1, MIPSReg reg3 = -1, MIPSReg reg4 = -1); + void SpillLockV(MIPSReg r) { SpillLock(r + 32); } + + void ReleaseSpillLocksAndDiscardTemps(); + void ReleaseSpillLock(int mipsreg) + { + mr[mipsreg].spillLock = false; + } + void ReleaseSpillLockV(int mipsreg) { + ReleaseSpillLock(mipsreg + 32); + } + + void SetImm(MIPSReg reg, u32 immVal); + bool IsImm(MIPSReg reg) const; + u32 GetImm(MIPSReg reg) const; + + // Returns an PPC register containing the requested MIPS register. 
+ PPCReg MapReg(MIPSReg reg, int mapFlags = 0); + void MapInIn(MIPSReg rd, MIPSReg rs); + void MapInInV(int rt, int rs); + void MapDirtyInV(int rd, int rs, bool avoidLoad = true); + void MapDirtyInInV(int rd, int rs, int rt, bool avoidLoad = true); + void MapDirty(MIPSReg rd); + void MapDirtyIn(MIPSReg rd, MIPSReg rs, bool avoidLoad = true); + void MapDirtyInIn(MIPSReg rd, MIPSReg rs, MIPSReg rt, bool avoidLoad = true); + void FlushPpcReg(PPCReg r); + void FlushR(MIPSReg r); + void FlushV(MIPSReg r) { FlushR(r + 32); } + void DiscardR(MIPSReg r); + void DiscardV(MIPSReg r) { DiscardR(r + 32);} + bool IsTempX(PPCReg r) const; + + MIPSReg GetTempR(); + MIPSReg GetTempV() { return GetTempR() - 32; } + + void FlushAll(); + + PPCReg R(int preg); // Returns a cached register + + // VFPU registers + + PPCReg V(int vreg) { return R(vreg + 32); } + + void MapRegV(int vreg, int flags = 0); + + void LoadToRegV(PPCReg ppcReg, int vreg); + + // NOTE: These require you to release spill locks manually! + void MapRegsAndSpillLockV(int vec, VectorSize vsz, int flags); + void MapRegsAndSpillLockV(const u8 *v, VectorSize vsz, int flags); + + void SpillLockV(const u8 *v, VectorSize vsz); + void SpillLockV(int vec, VectorSize vsz); + + void SetEmitter(PPCXEmitter *emitter) { emit_ = emitter; } + + // For better log output only. 
+ void SetCompilerPC(u32 compilerPC) { compilerPC_ = compilerPC; } + + int GetMipsRegOffset(MIPSReg r); + int GetMipsRegOffsetV(MIPSReg r) { + return GetMipsRegOffset(r + 32); + } + +private: + MIPSState *mips_; + PPCXEmitter *emit_; + u32 compilerPC_; + + enum { + NUM_PPCFPUREG = 32, + NUM_MIPSFPUREG = PpcJitConstants::TOTAL_MAPPABLE_MIPSFPUREGS, + }; + + RegPPC ar[NUM_PPCFPUREG]; + FPURegMIPS mr[NUM_MIPSFPUREG]; + FPURegMIPS *vr; +}; diff --git a/Core/MIPS/PPC/PpcRegCacheVPU.cpp b/Core/MIPS/PPC/PpcRegCacheVPU.cpp new file mode 100644 index 000000000000..5164449ba804 --- /dev/null +++ b/Core/MIPS/PPC/PpcRegCacheVPU.cpp @@ -0,0 +1,318 @@ +// Copyright (c) 2012- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. 
+ +#include "ppsspp_config.h" +#if PPSSPP_ARCH(POWERPC) + +#include "Common/ppcEmitter.h" +#include "PpcRegCacheVPU.h" +#include "PpcJit.h" + +using namespace PpcGen; +using namespace PpcJitConstants; + +PpcRegCacheVPU::PpcRegCacheVPU(MIPSState *mips, MIPSComp::PpcJitOptions *options) : mips_(mips), options_(options) { +} + +void PpcRegCacheVPU::Init(PPCXEmitter *emitter) { + emit_ = emitter; +} + +void PpcRegCacheVPU::Start(MIPSAnalyst::AnalysisResults &stats) { + for (int i = 0; i < NUM_PPCVPUREG; i++) { + ar[i].mipsReg = -1; + ar[i].isDirty = false; + } + for (int i = 0; i < NUM_MIPSVPUREG; i++) { + mr[i].loc = ML_MEM; + mr[i].reg = INVALID_REG; + mr[i].imm = -1; + mr[i].spillLock = false; + } +} + +const PPCReg *PpcRegCacheVPU::GetMIPSAllocationOrder(int &count) const { + // Note that R0 is reserved as scratch for now. + // R1 could be used as it's only used for scratch outside "regalloc space" now. + // R12 is also potentially usable. + // R4-R7 are registers we could use for static allocation or downcount. + // R8 is used to preserve flags in nasty branches. + // R9 and upwards are reserved for jit basics. + if (options_->downcountInRegister) { + static const PPCReg allocationOrder[] = { + /*R14, R15, R16, R17, R18, R19,*/ + R20, R21, R22, R23, R24, R25, + R26, R27, R28, R29, R30, R31, + }; + count = sizeof(allocationOrder) / sizeof(const int); + return allocationOrder; + } else { + static const PPCReg allocationOrder2[] = { + /*R14, R15, R16, R17, R18, R19,*/ + R20, R21, R22, R23, R24, R25, + R26, R27, R28, R29, R30, R31, + }; + count = sizeof(allocationOrder2) / sizeof(const int); + return allocationOrder2; + } +} + +void PpcRegCacheVPU::FlushBeforeCall() { + // R4-R11 are preserved. Others need flushing. + /* + FlushPpcReg(R2); + FlushPpcReg(R3); + FlushPpcReg(R12); + */ +} + +// TODO: Somewhat smarter spilling - currently simply spills the first available, should do +// round robin or FIFO or something. 
+PPCReg PpcRegCacheVPU::MapReg(MIPSReg mipsReg, int mapFlags) { + // Let's see if it's already mapped. If so we just need to update the dirty flag. + // We don't need to check for ML_NOINIT because we assume that anyone who maps + // with that flag immediately writes a "known" value to the register. + if (mr[mipsReg].loc == ML_PPCREG) { + if (ar[mr[mipsReg].reg].mipsReg != mipsReg) { + ERROR_LOG(HLE, "Register mapping out of sync! %i", mipsReg); + } + if (mapFlags & MAP_DIRTY) { + ar[mr[mipsReg].reg].isDirty = true; + } + return (PPCReg)mr[mipsReg].reg; + } + + // Okay, not mapped, so we need to allocate an ARM register. + + int allocCount; + const PPCReg *allocOrder = GetMIPSAllocationOrder(allocCount); + +allocate: + for (int i = 0; i < allocCount; i++) { + int reg = allocOrder[i]; + + if (ar[reg].mipsReg == -1) { + // That means it's free. Grab it, and load the value into it (if requested). + ar[reg].isDirty = (mapFlags & MAP_DIRTY) ? true : false; + if (!(mapFlags & MAP_NOINIT)) { + if (mr[mipsReg].loc == ML_MEM) { + if (mipsReg != 0) { + emit_->LWZ((PPCReg)reg, CTXREG, GetMipsRegOffset(mipsReg)); + } else { + // If we get a request to load the zero register, at least we won't spend + // time on a memory access... + emit_->MOVI2R((PPCReg)reg, 0); + } + } else if (mr[mipsReg].loc == ML_IMM) { + emit_->MOVI2R((PPCReg)reg, mr[mipsReg].imm); + ar[reg].isDirty = true; // IMM is always dirty. + } + } + ar[reg].mipsReg = mipsReg; + mr[mipsReg].loc = ML_PPCREG; + mr[mipsReg].reg = (PPCReg)reg; + return (PPCReg)reg; + } + } + + // Still nothing. Let's spill a reg and goto 10. + // TODO: Use age or something to choose which register to spill? + // TODO: Spill dirty regs first? or opposite? 
+ int bestToSpill = -1; + for (int i = 0; i < allocCount; i++) { + int reg = allocOrder[i]; + if (ar[reg].mipsReg != -1 && mr[ar[reg].mipsReg].spillLock) + continue; + bestToSpill = reg; + break; + } + + if (bestToSpill != -1) { + // ERROR_LOG(JIT, "Out of registers at PC %08x - spills register %i.", mips_->pc, bestToSpill); + FlushPpcReg((PPCReg)bestToSpill); + goto allocate; + } + + // Uh oh, we have all them spilllocked.... + ERROR_LOG(JIT, "Out of spillable registers at PC %08x!!!", mips_->pc); + return INVALID_REG; +} + +void PpcRegCacheVPU::MapInIn(MIPSReg rd, MIPSReg rs) { + SpillLock(rd, rs); + MapReg(rd); + MapReg(rs); + ReleaseSpillLocks(); +} + +void PpcRegCacheVPU::MapDirtyIn(MIPSReg rd, MIPSReg rs, bool avoidLoad) { + SpillLock(rd, rs); + bool load = !avoidLoad || rd == rs; + MapReg(rd, MAP_DIRTY | (load ? 0 : MAP_NOINIT)); + MapReg(rs); + ReleaseSpillLocks(); +} + +void PpcRegCacheVPU::MapDirtyInIn(MIPSReg rd, MIPSReg rs, MIPSReg rt, bool avoidLoad) { + SpillLock(rd, rs, rt); + bool load = !avoidLoad || (rd == rs || rd == rt); + MapReg(rd, MAP_DIRTY | (load ? 0 : MAP_NOINIT)); + MapReg(rt); + MapReg(rs); + ReleaseSpillLocks(); +} + +void PpcRegCacheVPU::MapDirtyDirtyInIn(MIPSReg rd1, MIPSReg rd2, MIPSReg rs, MIPSReg rt, bool avoidLoad) { + SpillLock(rd1, rd2, rs, rt); + bool load1 = !avoidLoad || (rd1 == rs || rd1 == rt); + bool load2 = !avoidLoad || (rd2 == rs || rd2 == rt); + MapReg(rd1, MAP_DIRTY | (load1 ? 0 : MAP_NOINIT)); + MapReg(rd2, MAP_DIRTY | (load2 ? 0 : MAP_NOINIT)); + MapReg(rt); + MapReg(rs); + ReleaseSpillLocks(); +} + +void PpcRegCacheVPU::FlushPpcReg(PPCReg r) { + if (ar[r].mipsReg == -1) { + // Nothing to do, reg not mapped. + return; + } + if (ar[r].mipsReg != -1) { + if (ar[r].isDirty && mr[ar[r].mipsReg].loc == ML_PPCREG) + emit_->STW(r, CTXREG, GetMipsRegOffset(ar[r].mipsReg)); + // IMMs won't be in an ARM reg. 
+ mr[ar[r].mipsReg].loc = ML_MEM; + mr[ar[r].mipsReg].reg = INVALID_REG; + mr[ar[r].mipsReg].imm = 0; + } else { + ERROR_LOG(HLE, "Dirty but no mipsreg?"); + } + ar[r].isDirty = false; + ar[r].mipsReg = -1; +} + +void PpcRegCacheVPU::FlushR(MIPSReg r) { + switch (mr[r].loc) { + case ML_IMM: + // IMM is always "dirty". + emit_->MOVI2R(SREG, mr[r].imm); + emit_->STW(SREG, CTXREG, GetMipsRegOffset(r)); + break; + + case ML_PPCREG: + if (mr[r].reg == INVALID_REG) { + ERROR_LOG(HLE, "FlushMipsReg: MipsReg had bad PpcReg"); + } + if (ar[mr[r].reg].isDirty) { + emit_->STW((PPCReg)mr[r].reg, CTXREG, GetMipsRegOffset(r)); + ar[mr[r].reg].isDirty = false; + } + ar[mr[r].reg].mipsReg = -1; + break; + + case ML_MEM: + // Already there, nothing to do. + break; + + default: + //BAD + break; + } + mr[r].loc = ML_MEM; + mr[r].reg = INVALID_REG; + mr[r].imm = 0; +} + +void PpcRegCacheVPU::FlushAll() { + for (int i = 0; i < NUM_MIPSVPUREG; i++) { + FlushR(i); + } + // Sanity check + for (int i = 0; i < NUM_PPCVPUREG; i++) { + if (ar[i].mipsReg != -1) { + ERROR_LOG(JIT, "Flush fail: ar[%i].mipsReg=%i", i, ar[i].mipsReg); + } + } +} + +void PpcRegCacheVPU::SetImm(MIPSReg r, u32 immVal) { + if (r == 0) + ERROR_LOG(JIT, "Trying to set immediate %08x to r0", immVal); + + // Zap existing value if cached in a reg + if (mr[r].loc == ML_PPCREG) { + ar[mr[r].reg].mipsReg = -1; + ar[mr[r].reg].isDirty = false; + } + mr[r].loc = ML_IMM; + mr[r].imm = immVal; + mr[r].reg = INVALID_REG; +} + +bool PpcRegCacheVPU::IsImm(MIPSReg r) const { + if (r == 0) return true; + return mr[r].loc == ML_IMM; +} + +u32 PpcRegCacheVPU::GetImm(MIPSReg r) const { + if (r == 0) return 0; + if (mr[r].loc != ML_IMM) { + ERROR_LOG(JIT, "Trying to get imm from non-imm register %i", r); + } + return mr[r].imm; +} + +int PpcRegCacheVPU::GetMipsRegOffset(MIPSReg r) { + if (r < 32) + return r * 4; + switch (r) { + case MIPSREG_HI: + return offsetof(MIPSState, hi); + case MIPSREG_LO: + return offsetof(MIPSState, lo); + } + 
ERROR_LOG(JIT, "bad mips register %i", r); + return 0; // or what? +} + +void PpcRegCacheVPU::SpillLock(MIPSReg r1, MIPSReg r2, MIPSReg r3, MIPSReg r4) { + mr[r1].spillLock = true; + if (r2 != -1) mr[r2].spillLock = true; + if (r3 != -1) mr[r3].spillLock = true; + if (r4 != -1) mr[r4].spillLock = true; +} + +void PpcRegCacheVPU::ReleaseSpillLocks() { + for (int i = 0; i < NUM_MIPSVPUREG; i++) { + mr[i].spillLock = false; + } +} + +void PpcRegCacheVPU::ReleaseSpillLock(MIPSReg reg) { + mr[reg].spillLock = false; +} + +PPCReg PpcRegCacheVPU::R(int mipsReg) { + if (mr[mipsReg].loc == ML_PPCREG) { + return (PPCReg)mr[mipsReg].reg; + } else { + ERROR_LOG(JIT, "Reg %i not in ppc reg. compilerPC = %08x", mipsReg, compilerPC_); + return INVALID_REG; // BAAAD + } +} +#endif diff --git a/Core/MIPS/PPC/PpcRegCacheVPU.h b/Core/MIPS/PPC/PpcRegCacheVPU.h new file mode 100644 index 000000000000..a15681d829a2 --- /dev/null +++ b/Core/MIPS/PPC/PpcRegCacheVPU.h @@ -0,0 +1,106 @@ +// Copyright (c) 2012- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. 
+ + +/** +PPC reg cache based on arm version +**/ + +#pragma once + +#include "../MIPS.h" +#include "../MIPSAnalyst.h" +#include "ppcEmitter.h" +#include "Core/MIPS/PPC/PpcRegCache.h" + +using namespace PpcGen; + +typedef int MIPSReg; + +struct VPURegPPC { + int mipsReg; // if -1, no mipsreg attached. + bool isDirty; // Should the register be written back? +}; + +struct VPURegMIPS { + // Where is this MIPS register? + PpcJitConstants::RegMIPSLoc loc; + // Data (only one of these is used, depending on loc. Could make a union). + u32 imm; + PPCReg reg; // reg index + bool spillLock; // if true, this register cannot be spilled. + // If loc == ML_MEM, it's back in its location in the CPU context struct. +}; +namespace MIPSComp { + struct PpcJitOptions; +} + +class PpcRegCacheVPU +{ +public: + PpcRegCacheVPU(MIPSState *mips, MIPSComp::PpcJitOptions *options); + ~PpcRegCacheVPU() {} + + void Init(PPCXEmitter *emitter); + void Start(MIPSAnalyst::AnalysisResults &stats); + + // Protect the arm register containing a MIPS register from spilling, to ensure that + // it's being kept allocated. + void SpillLock(MIPSReg reg, MIPSReg reg2 = -1, MIPSReg reg3 = -1, MIPSReg reg4 = -1); + void ReleaseSpillLock(MIPSReg reg); + void ReleaseSpillLocks(); + + void SetImm(MIPSReg reg, u32 immVal); + bool IsImm(MIPSReg reg) const; + u32 GetImm(MIPSReg reg) const; + + // Returns an ARM register containing the requested MIPS register. 
+ PPCReg MapReg(MIPSReg reg, int mapFlags = 0); + void MapInIn(MIPSReg rd, MIPSReg rs); + void MapDirtyIn(MIPSReg rd, MIPSReg rs, bool avoidLoad = true); + void MapDirtyInIn(MIPSReg rd, MIPSReg rs, MIPSReg rt, bool avoidLoad = true); + void MapDirtyDirtyInIn(MIPSReg rd1, MIPSReg rd2, MIPSReg rs, MIPSReg rt, bool avoidLoad = true); + void FlushPpcReg(PPCReg r); + void FlushR(MIPSReg r); + void FlushBeforeCall(); + void FlushAll(); + + PPCReg R(int preg); // Returns a cached register + + void SetEmitter(PPCXEmitter *emitter) { emit_ = emitter; } + + // For better log output only. + void SetCompilerPC(u32 compilerPC) { compilerPC_ = compilerPC; } + + int GetMipsRegOffset(MIPSReg r); + +private: + const PPCReg *GetMIPSAllocationOrder(int &count) const; + + MIPSState *mips_; + MIPSComp::PpcJitOptions *options_; + PPCXEmitter *emit_; + u32 compilerPC_; + + enum { + NUM_PPCVPUREG = 128, + NUM_MIPSVPUREG = 32, + }; + + VPURegPPC ar[NUM_PPCVPUREG]; + VPURegMIPS mr[NUM_MIPSVPUREG]; +}; diff --git a/Core/MIPS/fake/FakeJit.cpp b/Core/MIPS/fake/FakeJit.cpp index 257005a42935..53dcd30d3732 100644 --- a/Core/MIPS/fake/FakeJit.cpp +++ b/Core/MIPS/fake/FakeJit.cpp @@ -16,6 +16,7 @@ // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. 
#include "Common/Serialize/Serializer.h" +#include "Common/Serialize/SerializeFuncs.h" #include "Core/Reporting.h" #include "Core/Config.h" #include "Core/Core.h" @@ -51,9 +52,9 @@ void FakeJit::DoState(PointerWrap &p) if (!s) return; - p.Do(js.startDefaultPrefix); + Do(p, js.startDefaultPrefix); if (s >= 2) { - p.Do(js.hasSetRounding); + Do(p, js.hasSetRounding); js.lastSetRounding = 0; } else { js.hasSetRounding = 1; @@ -68,13 +69,17 @@ void FakeJit::DoDummyState(PointerWrap &p) return; bool dummy = false; - p.Do(dummy); + Do(p , dummy); if (s >= 2) { dummy = true; - p.Do(dummy); + Do(p, dummy); } } +void FakeJit::UpdateFCR31() +{ +} + void FakeJit::FlushAll() { //gpr.FlushAll(); @@ -146,6 +151,12 @@ void FakeJit::Comp_RunBlock(MIPSOpcode op) ERROR_LOG(JIT, "Comp_RunBlock should never be reached!"); } +void FakeJit::LinkBlock(u8 *exitPoint, const u8 *checkedEntry) { +} + +void FakeJit::UnlinkBlock(u8 *checkedEntry, u32 originalAddress) { +} + bool FakeJit::ReplaceJalTo(u32 dest) { return true; } @@ -213,6 +224,16 @@ void FakeJit::WriteSyscallExit() { } +MIPSOpcode FakeJit::GetOriginalOp(MIPSOpcode op) { + JitBlockCache *bc = GetBlockCache(); + int block_num = bc->GetBlockNumberFromEmuHackOp(op, true); + if (block_num >= 0) { + return bc->GetOriginalFirstOp(block_num); + } else { + return op; + } +} + #define _RS ((op>>21) & 0x1F) #define _RT ((op>>16) & 0x1F) #define _RD ((op>>11) & 0x1F) diff --git a/Core/MIPS/fake/FakeJit.h b/Core/MIPS/fake/FakeJit.h index d23303a65463..10a260179917 100644 --- a/Core/MIPS/fake/FakeJit.h +++ b/Core/MIPS/fake/FakeJit.h @@ -19,6 +19,7 @@ #include "Common/FakeEmitter.h" #include "Core/MIPS/JitCommon/JitState.h" +#include "Core/MIPS/JitCommon/JitCommon.h" #include "Core/MIPS/JitCommon/JitBlockCache.h" #include "../MIPSVFPUUtils.h" @@ -30,7 +31,7 @@ namespace MIPSComp { typedef int FakeReg; -class FakeJit : public FakeGen::FakeXCodeBlock { +class FakeJit : public FakeGen::FakeXCodeBlock , public JitInterface, public 
MIPSFrontendInterface { public: FakeJit(MIPSState *mips); @@ -47,6 +48,7 @@ class FakeJit : public FakeGen::FakeXCodeBlock { const u8 *DoJit(u32 em_address, JitBlock *b); bool DescribeCodePtr(const u8 *ptr, std::string &name); + MIPSOpcode GetOriginalOp(MIPSOpcode op); void CompileDelaySlot(int flags); void EatInstruction(MIPSOpcode op); @@ -126,11 +128,20 @@ class FakeJit : public FakeGen::FakeXCodeBlock { int Replace_fabsf() { return 0; } JitBlockCache *GetBlockCache() { return &blocks; } + JitBlockCacheDebugInterface *GetBlockCacheDebugInterface() override { return &blocks; } + + std::vector SaveAndClearEmuHackOps() override { return blocks.SaveAndClearEmuHackOps(); } + void RestoreSavedEmuHackOps(std::vector saved) override { blocks.RestoreSavedEmuHackOps(saved); } void ClearCache(); void InvalidateCacheAt(u32 em_address, int length = 4); + void UpdateFCR31(); void EatPrefix() { js.EatPrefix(); } + const u8 *GetDispatcher() const { return dispatcher; } + + void LinkBlock(u8 *exitPoint, const u8 *checkedEntry); + void UnlinkBlock(u8 *checkedEntry, u32 originalAddress); private: void GenerateFixedCode(); diff --git a/Core/MemMap.cpp b/Core/MemMap.cpp index 1a60d42b9cc2..ab91e9b53c72 100644 --- a/Core/MemMap.cpp +++ b/Core/MemMap.cpp @@ -165,8 +165,8 @@ static bool Memory_TryBase(u32 flags) { *view.out_ptr = (u8*)g_arena.CreateView( position, view.size, base + view.virtual_address); if (!*view.out_ptr) { - goto bail; DEBUG_LOG(MEMMAP, "Failed at view %d", i); + goto bail; } #else if (CanIgnoreView(view)) { @@ -297,6 +297,7 @@ bool Init() { int flags = 0; if (!MemoryMap_Setup(flags)) { + _assert_msg_(0, "MemoryMap_Setup: Failed."); return false; } diff --git a/Core/MemMap.h b/Core/MemMap.h index d0cac9f0828a..77b50ee920d5 100644 --- a/Core/MemMap.h +++ b/Core/MemMap.h @@ -175,9 +175,9 @@ inline u32 ReadUnchecked_U32(const u32 address) { inline float ReadUnchecked_Float(const u32 address) { #ifdef MASKED_PSP_MEMORY - return *(float *)(base + (address & 
MEMVIEW32_MASK)); + return *(float_le *)(base + (address & MEMVIEW32_MASK)); #else - return *(float *)(base + address); + return *(float_le *)(base + address); #endif } @@ -207,9 +207,9 @@ inline void WriteUnchecked_U32(u32 data, u32 address) { inline void WriteUnchecked_Float(float data, u32 address) { #ifdef MASKED_PSP_MEMORY - *(float *)(base + (address & MEMVIEW32_MASK)) = data; + *(float_le *)(base + (address & MEMVIEW32_MASK)) = data; #else - *(float *)(base + address) = data; + *(float_le *)(base + address) = data; #endif } diff --git a/Core/Screenshot.cpp b/Core/Screenshot.cpp index 74c10ca2fe61..5342d5525a42 100644 --- a/Core/Screenshot.cpp +++ b/Core/Screenshot.cpp @@ -121,9 +121,9 @@ static bool WriteScreenshotToPNG(png_imagep image, const char *filename, int con static bool ConvertPixelTo8888RGBA(GPUDebugBufferFormat fmt, u8 &r, u8 &g, u8 &b, u8 &a, const void *buffer, int offset, bool rev) { const u8 *buf8 = (const u8 *)buffer; - const u16 *buf16 = (const u16 *)buffer; - const u32 *buf32 = (const u32 *)buffer; - const float *fbuf = (const float *)buffer; + const u16_le *buf16 = (const u16_le *)buffer; + const u32_le *buf32 = (const u32_le *)buffer; + const float_le *fbuf = (const float_le *)buffer; // NOTE: a and r might be the same channel. This is used for RGB. @@ -133,7 +133,7 @@ static bool ConvertPixelTo8888RGBA(GPUDebugBufferFormat fmt, u8 &r, u8 &g, u8 &b case GPU_DBG_FORMAT_565: src = buf16[offset]; if (rev) { - src = bswap16(src); + src = swap16(src); } a = 255; r = Convert5To8((src >> 0) & 0x1F); @@ -143,7 +143,7 @@ static bool ConvertPixelTo8888RGBA(GPUDebugBufferFormat fmt, u8 &r, u8 &g, u8 &b case GPU_DBG_FORMAT_5551: src = buf16[offset]; if (rev) { - src = bswap16(src); + src = swap16(src); } a = (src >> 15) ? 
255 : 0; r = Convert5To8((src >> 0) & 0x1F); @@ -153,7 +153,7 @@ static bool ConvertPixelTo8888RGBA(GPUDebugBufferFormat fmt, u8 &r, u8 &g, u8 &b case GPU_DBG_FORMAT_4444: src = buf16[offset]; if (rev) { - src = bswap16(src); + src = swap16(src); } a = Convert4To8((src >> 12) & 0xF); r = Convert4To8((src >> 0) & 0xF); @@ -163,7 +163,7 @@ static bool ConvertPixelTo8888RGBA(GPUDebugBufferFormat fmt, u8 &r, u8 &g, u8 &b case GPU_DBG_FORMAT_8888: src = buf32[offset]; if (rev) { - src = bswap32(src); + src = swap32(src); } a = (src >> 24) & 0xFF; r = (src >> 0) & 0xFF; diff --git a/Core/TextureReplacer.cpp b/Core/TextureReplacer.cpp index 7cdb34c23a8d..48bce75647e5 100644 --- a/Core/TextureReplacer.cpp +++ b/Core/TextureReplacer.cpp @@ -465,25 +465,25 @@ void TextureReplacer::NotifyTextureDecoded(const ReplacedTextureDecodeInfo &repl saveBuf.resize((pitch * h) / sizeof(u16)); switch (replacedInfo.fmt) { case ReplacedTextureFormat::F_5650: - ConvertRGB565ToRGBA8888(saveBuf.data(), (const u16 *)data, (pitch * h) / sizeof(u16)); + ConvertRGB565ToRGBA8888(saveBuf.data(), (const u16_le *)data, (pitch * h) / sizeof(u16)); break; case ReplacedTextureFormat::F_5551: - ConvertRGBA5551ToRGBA8888(saveBuf.data(), (const u16 *)data, (pitch * h) / sizeof(u16)); + ConvertRGBA5551ToRGBA8888(saveBuf.data(), (const u16_le *)data, (pitch * h) / sizeof(u16)); break; case ReplacedTextureFormat::F_4444: - ConvertRGBA4444ToRGBA8888(saveBuf.data(), (const u16 *)data, (pitch * h) / sizeof(u16)); + ConvertRGBA4444ToRGBA8888(saveBuf.data(), (const u16_le *)data, (pitch * h) / sizeof(u16)); break; case ReplacedTextureFormat::F_0565_ABGR: - ConvertBGR565ToRGBA8888(saveBuf.data(), (const u16 *)data, (pitch * h) / sizeof(u16)); + ConvertBGR565ToRGBA8888(saveBuf.data(), (const u16_le *)data, (pitch * h) / sizeof(u16)); break; case ReplacedTextureFormat::F_1555_ABGR: - ConvertABGR1555ToRGBA8888(saveBuf.data(), (const u16 *)data, (pitch * h) / sizeof(u16)); + ConvertABGR1555ToRGBA8888(saveBuf.data(), 
(const u16_le *)data, (pitch * h) / sizeof(u16)); break; case ReplacedTextureFormat::F_4444_ABGR: - ConvertABGR4444ToRGBA8888(saveBuf.data(), (const u16 *)data, (pitch * h) / sizeof(u16)); + ConvertABGR4444ToRGBA8888(saveBuf.data(), (const u16_le *)data, (pitch * h) / sizeof(u16)); break; case ReplacedTextureFormat::F_8888_BGRA: - ConvertBGRA8888ToRGBA8888(saveBuf.data(), (const u32 *)data, (pitch * h) / sizeof(u32)); + ConvertBGRA8888ToRGBA8888(saveBuf.data(), (const u32_le *)data, (pitch * h) / sizeof(u32)); break; case ReplacedTextureFormat::F_8888: // Impossible. Just so we can get warnings on other missed formats. @@ -653,7 +653,7 @@ void ReplacedTexture::Load(int level, void *out, int rowPitch) { if (!checkedAlpha) { // This will only check the hashed bits. - CheckAlphaResult res = CheckAlphaRGBA8888Basic((u32 *)out, rowPitch / sizeof(u32), png.width, png.height); + CheckAlphaResult res = CheckAlphaRGBA8888Basic((u32_le *)out, rowPitch / sizeof(u32), png.width, png.height); if (res == CHECKALPHA_ANY || level == 0) { alphaStatus_ = ReplacedTextureAlpha(res); } diff --git a/Core/TextureReplacer.h b/Core/TextureReplacer.h index d90f373c4ecc..2a6462532787 100644 --- a/Core/TextureReplacer.h +++ b/Core/TextureReplacer.h @@ -23,6 +23,7 @@ #include #include "Common/Common.h" #include "Common/MemoryUtil.h" +#include "Common/Swap.h" #include "GPU/ge_constants.h" class IniFile; @@ -197,7 +198,7 @@ class TextureReplacer { std::string HashName(u64 cachekey, u32 hash, int level); void PopulateReplacement(ReplacedTexture *result, u64 cachekey, u32 hash, int w, int h); - SimpleBuf saveBuf; + SimpleBuf saveBuf; bool enabled_ = false; bool allowVideo_ = false; bool ignoreAddress_ = false; diff --git a/Core/Util/AudioFormat.cpp b/Core/Util/AudioFormat.cpp index 54610949f30a..0e82347d1062 100644 --- a/Core/Util/AudioFormat.cpp +++ b/Core/Util/AudioFormat.cpp @@ -24,7 +24,7 @@ #include #endif -void AdjustVolumeBlockStandard(s16 *out, s16 *in, size_t size, int leftVol, int 
rightVol) { +void AdjustVolumeBlockStandard(s16_le *out, s16_le *in, size_t size, int leftVol, int rightVol) { #ifdef _M_SSE if (leftVol <= 0x7fff && -leftVol <= 0x8000 && rightVol <= 0x7fff && -rightVol <= 0x8000) { __m128i volume = _mm_set_epi16(leftVol, rightVol, leftVol, rightVol, leftVol, rightVol, leftVol, rightVol); diff --git a/Core/Util/AudioFormat.h b/Core/Util/AudioFormat.h index eb5098ad01a1..fe9c17683d10 100644 --- a/Core/Util/AudioFormat.h +++ b/Core/Util/AudioFormat.h @@ -20,6 +20,7 @@ #include "ppsspp_config.h" #include "Common/Common.h" #include "Common/CommonTypes.h" +#include "Common/Swap.h" #include "Core/Util/AudioFormatNEON.h" #define IS_LITTLE_ENDIAN (*(const u16 *)"\0\xff" >= 0x100) @@ -67,7 +68,7 @@ static inline s16 ApplySampleVolume20Bit(s16 sample, int vol20) { } void SetupAudioFormats(); -void AdjustVolumeBlockStandard(s16 *out, s16 *in, size_t size, int leftVol, int rightVol); +void AdjustVolumeBlockStandard(s16_le *out, s16_le *in, size_t size, int leftVol, int rightVol); void ConvertS16ToF32(float *ou, const s16 *in, size_t size); #ifdef _M_SSE @@ -75,6 +76,6 @@ void ConvertS16ToF32(float *ou, const s16 *in, size_t size); #elif PPSSPP_ARCH(ARM64) #define AdjustVolumeBlock AdjustVolumeBlockNEON #else -typedef void (*AdjustVolumeBlockFunc)(s16 *out, s16 *in, size_t size, int leftVol, int rightVol); +typedef void (*AdjustVolumeBlockFunc)(s16_le *out, s16_le *in, size_t size, int leftVol, int rightVol); extern AdjustVolumeBlockFunc AdjustVolumeBlock; #endif diff --git a/Core/Util/PPGeDraw.cpp b/Core/Util/PPGeDraw.cpp index 05484bd8af16..0c2fc5fd3c88 100644 --- a/Core/Util/PPGeDraw.cpp +++ b/Core/Util/PPGeDraw.cpp @@ -274,7 +274,7 @@ void __PPGeInit() { textDrawerImages.clear(); INFO_LOG(SCEGE, "PPGe drawing library initialized. 
DL: %08x Data: %08x Atlas: %08x (%i) Args: %08x", - dlPtr, dataPtr, atlasPtr, atlasSize, listArgs.ptr); + dlPtr, dataPtr, atlasPtr, atlasSize, (u32)listArgs.ptr); } void __PPGeDoState(PointerWrap &p) diff --git a/Core/Util/PortManager.cpp b/Core/Util/PortManager.cpp index bc6b18ace2a5..900338390845 100644 --- a/Core/Util/PortManager.cpp +++ b/Core/Util/PortManager.cpp @@ -67,7 +67,9 @@ PortManager::~PortManager() { void PortManager::Terminate() { VERBOSE_LOG(SCENET, "PortManager::Terminate()"); if (urls) { +#ifndef __wiiu__ FreeUPNPUrls(urls); +#endif free(urls); urls = NULL; } @@ -126,6 +128,7 @@ bool PortManager::Initialize(const unsigned int timeout) { memset(urls, 0, sizeof(struct UPNPUrls)); memset(datas, 0, sizeof(struct IGDdatas)); +#ifndef __wiiu__ devlist = upnpDiscover(timeout, NULL, NULL, localport, ipv6, ttl, &error); if (devlist) { @@ -175,6 +178,7 @@ bool PortManager::Initialize(const unsigned int timeout) { RefreshPortList(); return true; } +#endif ERROR_LOG(SCENET, "PortManager - upnpDiscover failed (error: %i) or No UPnP device detected", error); auto n = GetI18NCategory("Networking"); @@ -200,6 +204,7 @@ bool PortManager::Add(const char* protocol, unsigned short port, unsigned short if (g_Config.bEnableUPnP) WARN_LOG(SCENET, "PortManager::Add - the init was not done !"); return false; } +#ifndef __wiiu__ sprintf(port_str, "%d", port); sprintf(intport_str, "%d", intport); // Only add new port map if it's not previously created by PPSSPP for current IP @@ -233,6 +238,7 @@ bool PortManager::Add(const char* protocol, unsigned short port, unsigned short // Keep tracks of it to be restored later if it belongs to others if (el_it != m_otherPortList.end()) el_it->taken = true; } +#endif return true; } @@ -246,6 +252,7 @@ bool PortManager::Remove(const char* protocol, unsigned short port) { return false; } sprintf(port_str, "%d", port); +#ifndef __wiiu__ int r = UPNP_DeletePortMapping(urls->controlURL, datas->first.servicetype, port_str, protocol, NULL); 
if (r != 0) { @@ -257,6 +264,7 @@ bool PortManager::Remove(const char* protocol, unsigned short port) { return false; } } +#endif for (auto it = m_portList.begin(); it != m_portList.end(); ) { (it->first == port_str && it->second == protocol) ? it = m_portList.erase(it) : ++it; } @@ -271,6 +279,7 @@ bool PortManager::Restore() { if (g_Config.bEnableUPnP) WARN_LOG(SCENET, "PortManager::Remove - the init was not done !"); return false; } +#ifndef __wiiu__ for (auto it = m_otherPortList.begin(); it != m_otherPortList.end(); ++it) { if (it->taken) { auto port_str = it->extPort_str; @@ -302,6 +311,7 @@ bool PortManager::Restore() { } } } +#endif return true; } @@ -324,6 +334,7 @@ bool PortManager::Clear() { if (g_Config.bEnableUPnP) WARN_LOG(SCENET, "PortManager::Clear - the init was not done !"); return false; } +#ifndef __wiiu__ //unsigned int num = 0; //UPNP_GetPortMappingNumberOfEntries(urls->controlURL, datas->first.servicetype, &num); // Not supported by many routers do { @@ -356,6 +367,7 @@ bool PortManager::Clear() { } i++; } while (r == 0); +#endif return true; } diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp index 8285191699b9..eb1e3e09f860 100644 --- a/GPU/Common/DrawEngineCommon.cpp +++ b/GPU/Common/DrawEngineCommon.cpp @@ -195,8 +195,11 @@ bool DrawEngineCommon::TestBoundingBox(void* control_points, int vertexCount, u3 // Try to skip NormalizeVertices if it's pure positions. No need to bother with a vertex decoder // and a large vertex format. 
- if ((vertType & 0xFFFFFF) == GE_VTYPE_POS_FLOAT) { - verts = (float *)control_points; + if ((vertType & 0xFFFFFF) == GE_VTYPE_POS_FLOAT) { + const float_le *vtx = (const float_le*)control_points; + for (int i = 0; i < vertexCount * 3; i++) { + verts[i] = vtx[i]; + } *bytesRead = 3 * sizeof(float) * vertexCount; } else if ((vertType & 0xFFFFFF) == GE_VTYPE_POS_8BIT) { const s8 *vtx = (const s8 *)control_points; @@ -205,7 +208,7 @@ bool DrawEngineCommon::TestBoundingBox(void* control_points, int vertexCount, u3 } *bytesRead = 3 * sizeof(s8) * vertexCount; } else if ((vertType & 0xFFFFFF) == GE_VTYPE_POS_16BIT) { - const s16 *vtx = (const s16*)control_points; + const s16_le *vtx = (const s16_le*)control_points; for (int i = 0; i < vertexCount * 3; i++) { verts[i] = vtx[i] * (1.0f / 32768.0f); } @@ -270,8 +273,8 @@ bool DrawEngineCommon::GetCurrentSimpleVertices(int count, std::vector +#include #include "IndexGenerator.h" #include "Common/Common.h" @@ -226,7 +227,7 @@ void IndexGenerator::TranslateList(int numInds, const ITypeLE *inds, int indexOf indexOffset = index_ - indexOffset; // We only bother doing this minor optimization in triangle list, since it's by far the most // common operation that can benefit. 
- if (sizeof(ITypeLE) == sizeof(inds_[0]) && indexOffset == 0 && clockwise) { + if (std::is_same::value && indexOffset == 0 && clockwise) { memcpy(inds_, inds, numInds * sizeof(ITypeLE)); inds_ += numInds; count_ += numInds; diff --git a/GPU/Common/ShaderId.h b/GPU/Common/ShaderId.h index f117341e7d8d..30d5a915fdcd 100644 --- a/GPU/Common/ShaderId.h +++ b/GPU/Common/ShaderId.h @@ -25,7 +25,7 @@ enum VShaderBit : uint8_t { VS_BIT_HAS_TEXCOORD_TESS = 13, // 1 bit VS_BIT_NORM_REVERSE_TESS = 14, // 1 bit VS_BIT_HAS_NORMAL_TESS = 15, // 1 bit - VS_BIT_UVGEN_MODE = 16, + VS_BIT_UVGEN_MODE = 16, // 2 bits VS_BIT_UVPROJ_MODE = 18, // 2, can overlap with LS0 VS_BIT_LS0 = 18, // 2 VS_BIT_LS1 = 20, // 2 diff --git a/GPU/Common/ShaderTranslation.cpp b/GPU/Common/ShaderTranslation.cpp index e0b58908fcfb..3031450388a7 100644 --- a/GPU/Common/ShaderTranslation.cpp +++ b/GPU/Common/ShaderTranslation.cpp @@ -66,12 +66,12 @@ static EShLanguage GetLanguage(const Draw::ShaderStage stage) { void ShaderTranslationInit() { // TODO: We have TLS issues on UWP -#if !PPSSPP_PLATFORM(UWP) +#if !PPSSPP_PLATFORM(UWP) && !PPSSPP_PLATFORM(WIIU) glslang::InitializeProcess(); #endif } void ShaderTranslationShutdown() { -#if !PPSSPP_PLATFORM(UWP) +#if !PPSSPP_PLATFORM(UWP) && !PPSSPP_PLATFORM(WIIU) glslang::FinalizeProcess(); #endif } @@ -231,7 +231,7 @@ bool TranslateShader(std::string *dest, ShaderLanguage destLang, TranslatedShade *errorMessage = ""; } -#if PPSSPP_PLATFORM(UWP) +#if PPSSPP_PLATFORM(UWP) || PPSSPP_PLATFORM(WIIU) return false; #endif diff --git a/GPU/Common/SplineCommon.cpp b/GPU/Common/SplineCommon.cpp index 2460af74dc21..9d5429b44053 100644 --- a/GPU/Common/SplineCommon.cpp +++ b/GPU/Common/SplineCommon.cpp @@ -50,7 +50,7 @@ class SimpleBufferManager { namespace Spline { -static void CopyQuadIndex(u16 *&indices, GEPatchPrimType type, const int idx0, const int idx1, const int idx2, const int idx3) { +static void CopyQuadIndex(u16_le *&indices, GEPatchPrimType type, const int idx0, 
const int idx1, const int idx2, const int idx3) { if (type == GE_PATCHPRIM_LINES) { *(indices++) = idx0; *(indices++) = idx2; @@ -68,7 +68,7 @@ static void CopyQuadIndex(u16 *&indices, GEPatchPrimType type, const int idx0, c } } -void BuildIndex(u16 *indices, int &count, int num_u, int num_v, GEPatchPrimType prim_type, int total) { +void BuildIndex(u16_le *indices, int &count, int num_u, int num_v, GEPatchPrimType prim_type, int total) { for (int v = 0; v < num_v; ++v) { for (int u = 0; u < num_u; ++u) { int idx0 = v * (num_u + 1) + u + total; // Top left @@ -543,7 +543,7 @@ void DrawEngineCommon::SubmitCurve(const void *control_points, const void *indic OutputBuffers output; output.vertices = (SimpleVertex *)(decoded + DECODED_VERTEX_BUFFER_SIZE / 2); - output.indices = decIndex; + output.indices = (u16_le*)decIndex; output.count = 0; surface.Init(DECODED_VERTEX_BUFFER_SIZE / 2 / vertexSize); diff --git a/GPU/Common/SplineCommon.h b/GPU/Common/SplineCommon.h index 26f4ae7a1eef..b3fc0deafb65 100644 --- a/GPU/Common/SplineCommon.h +++ b/GPU/Common/SplineCommon.h @@ -28,6 +28,7 @@ // PSP compatible format so we can use the end of the pipeline in beziers etc struct SimpleVertex { + SimpleVertex() {} float uv[2]; union { u8 color[4]; @@ -41,7 +42,7 @@ class SimpleBufferManager; namespace Spline { -void BuildIndex(u16 *indices, int &count, int num_u, int num_v, GEPatchPrimType prim_type, int total = 0); +void BuildIndex(u16_le *indices, int &count, int num_u, int num_v, GEPatchPrimType prim_type, int total = 0); enum SplineQuality { LOW_QUALITY = 0, @@ -109,7 +110,7 @@ struct BezierSurface : public SurfaceInfo { return index_v * (tess_u + 1) + index_u + num_verts_per_patch * patch_index; } - void BuildIndex(u16 *indices, int &count) const { + void BuildIndex(u16_le *indices, int &count) const { for (int patch_u = 0; patch_u < num_patches_u; ++patch_u) { for (int patch_v = 0; patch_v < num_patches_v; ++patch_v) { int patch_index = patch_v * num_patches_u + patch_u; @@ 
-146,7 +147,7 @@ struct SplineSurface : public SurfaceInfo { return index_v * num_vertices_u + index_u; } - void BuildIndex(u16 *indices, int &count) const { + void BuildIndex(u16_le *indices, int &count) const { Spline::BuildIndex(indices, count, num_patches_u * tess_u, num_patches_v * tess_v, primType); } }; @@ -198,7 +199,7 @@ struct ControlPoints { struct OutputBuffers { SimpleVertex *vertices; - u16 *indices; + u16_le *indices; int count; }; diff --git a/GPU/Common/StencilCommon.cpp b/GPU/Common/StencilCommon.cpp index 29af15c020ed..78c9abde474b 100644 --- a/GPU/Common/StencilCommon.cpp +++ b/GPU/Common/StencilCommon.cpp @@ -15,10 +15,11 @@ // Official git repository and contact information can be found at // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. +#include "Common/Swap.h" #include "GPU/Common/StencilCommon.h" u8 StencilBits5551(const u8 *ptr8, u32 numPixels) { - const u32 *ptr = (const u32 *)ptr8; + const u32_le *ptr = (const u32_le *)ptr8; for (u32 i = 0; i < numPixels / 2; ++i) { if (ptr[i] & 0x80008000) { @@ -29,7 +30,7 @@ u8 StencilBits5551(const u8 *ptr8, u32 numPixels) { } u8 StencilBits4444(const u8 *ptr8, u32 numPixels) { - const u32 *ptr = (const u32 *)ptr8; + const u32_le *ptr = (const u32_le *)ptr8; u32 bits = 0; for (u32 i = 0; i < numPixels / 2; ++i) { @@ -40,7 +41,7 @@ u8 StencilBits4444(const u8 *ptr8, u32 numPixels) { } u8 StencilBits8888(const u8 *ptr8, u32 numPixels) { - const u32 *ptr = (const u32 *)ptr8; + const u32_le *ptr = (const u32_le *)ptr8; u32 bits = 0; for (u32 i = 0; i < numPixels; ++i) { diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp index 73c8a5b918be..b3a255fea9dd 100644 --- a/GPU/Common/TextureCacheCommon.cpp +++ b/GPU/Common/TextureCacheCommon.cpp @@ -114,8 +114,8 @@ TextureCacheCommon::TextureCacheCommon(Draw::DrawContext *draw) decimationCounter_ = TEXCACHE_DECIMATION_INTERVAL; // TODO: Clamp down to 256/1KB? Need to check mipmapShareClut and clamp loadclut. 
- clutBufRaw_ = (u32 *)AllocateAlignedMemory(1024 * sizeof(u32), 16); // 4KB - clutBufConverted_ = (u32 *)AllocateAlignedMemory(1024 * sizeof(u32), 16); // 4KB + clutBufRaw_ = (u32_le *)AllocateAlignedMemory(1024 * sizeof(u32), 16); // 4KB + clutBufConverted_ = (u32_le *)AllocateAlignedMemory(1024 * sizeof(u32), 16); // 4KB // Zap so we get consistent behavior if the game fails to load some of the CLUT. memset(clutBufRaw_, 0, 1024 * sizeof(u32)); @@ -1350,18 +1350,18 @@ u32 TextureCacheCommon::EstimateTexMemoryUsage(const TexCacheEntry *entry) { static void ReverseColors(void *dstBuf, const void *srcBuf, GETextureFormat fmt, int numPixels, bool useBGRA) { switch (fmt) { case GE_TFMT_4444: - ConvertRGBA4444ToABGR4444((u16 *)dstBuf, (const u16 *)srcBuf, numPixels); + ConvertRGBA4444ToABGR4444((u16_le *)dstBuf, (const u16_le *)srcBuf, numPixels); break; // Final Fantasy 2 uses this heavily in animated textures. case GE_TFMT_5551: - ConvertRGBA5551ToABGR1555((u16 *)dstBuf, (const u16 *)srcBuf, numPixels); + ConvertRGBA5551ToABGR1555((u16_le *)dstBuf, (const u16_le *)srcBuf, numPixels); break; case GE_TFMT_5650: - ConvertRGB565ToBGR565((u16 *)dstBuf, (const u16 *)srcBuf, numPixels); + ConvertRGB565ToBGR565((u16_le *)dstBuf, (const u16_le *)srcBuf, numPixels); break; default: if (useBGRA) { - ConvertRGBA8888ToBGRA8888((u32 *)dstBuf, (const u32 *)srcBuf, numPixels); + ConvertRGBA8888ToBGRA8888((u32_le *)dstBuf, (const u32_le *)srcBuf, numPixels); } else { // No need to convert RGBA8888, right order already if (dstBuf != srcBuf) @@ -1371,7 +1371,7 @@ static void ReverseColors(void *dstBuf, const void *srcBuf, GETextureFormat fmt, } } -static inline void ConvertFormatToRGBA8888(GETextureFormat format, u32 *dst, const u16 *src, u32 numPixels) { +static inline void ConvertFormatToRGBA8888(GETextureFormat format, u32_le *dst, const u16_le *src, u32 numPixels) { switch (format) { case GE_TFMT_4444: ConvertRGBA4444ToRGBA8888(dst, src, numPixels); @@ -1388,7 +1388,7 @@ static 
inline void ConvertFormatToRGBA8888(GETextureFormat format, u32 *dst, con } } -static inline void ConvertFormatToRGBA8888(GEPaletteFormat format, u32 *dst, const u16 *src, u32 numPixels) { +static inline void ConvertFormatToRGBA8888(GEPaletteFormat format, u32_le *dst, const u16_le *src, u32 numPixels) { // The supported values are 1:1 identical. ConvertFormatToRGBA8888(GETextureFormat(format), dst, src, numPixels); } @@ -1430,24 +1430,24 @@ void TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureForm // Here, reverseColors means the CLUT is already reversed. if (reverseColors) { for (int y = 0; y < h; ++y) { - DeIndexTexture4Optimal((u16 *)(out + outPitch * y), texptr + (bufw * y) / 2, w, clutAlphaLinearColor_); + DeIndexTexture4Optimal((u16_le *)(out + outPitch * y), texptr + (bufw * y) / 2, w, clutAlphaLinearColor_); } } else { for (int y = 0; y < h; ++y) { - DeIndexTexture4OptimalRev((u16 *)(out + outPitch * y), texptr + (bufw * y) / 2, w, clutAlphaLinearColor_); + DeIndexTexture4OptimalRev((u16_le *)(out + outPitch * y), texptr + (bufw * y) / 2, w, clutAlphaLinearColor_); } } } else { - const u16 *clut = GetCurrentClut() + clutSharingOffset; + const u16_le *clut = GetCurrentClut() + clutSharingOffset; if (expandTo32bit && !reverseColors) { // We simply expand the CLUT to 32-bit, then we deindex as usual. Probably the fastest way. 
ConvertFormatToRGBA8888(clutformat, expandClut_, clut, 16); for (int y = 0; y < h; ++y) { - DeIndexTexture4((u32 *)(out + outPitch * y), texptr + (bufw * y) / 2, w, expandClut_); + DeIndexTexture4((u32_le *)(out + outPitch * y), texptr + (bufw * y) / 2, w, expandClut_); } } else { for (int y = 0; y < h; ++y) { - DeIndexTexture4((u16 *)(out + outPitch * y), texptr + (bufw * y) / 2, w, clut); + DeIndexTexture4((u16_le *)(out + outPitch * y), texptr + (bufw * y) / 2, w, clut); } } } @@ -1456,9 +1456,9 @@ void TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureForm case GE_CMODE_32BIT_ABGR8888: { - const u32 *clut = GetCurrentClut() + clutSharingOffset; + const u32_le *clut = GetCurrentClut() + clutSharingOffset; for (int y = 0; y < h; ++y) { - DeIndexTexture4((u32 *)(out + outPitch * y), texptr + (bufw * y) / 2, w, clut); + DeIndexTexture4((u32_le *)(out + outPitch * y), texptr + (bufw * y) / 2, w, clut); } } break; @@ -1493,7 +1493,7 @@ void TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureForm } } else if (expandTo32bit) { for (int y = 0; y < h; ++y) { - ConvertFormatToRGBA8888(format, (u32 *)(out + outPitch * y), (const u16 *)texptr + bufw * y, w); + ConvertFormatToRGBA8888(format, (u32_le *)(out + outPitch * y), (const u16_le *)texptr + bufw * y, w); } } else { for (int y = 0; y < h; ++y) { @@ -1518,7 +1518,7 @@ void TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureForm } } else if (expandTo32bit) { for (int y = 0; y < h; ++y) { - ConvertFormatToRGBA8888(format, (u32 *)(out + outPitch * y), (const u16 *)unswizzled + bufw * y, w); + ConvertFormatToRGBA8888(format, (u32_le *)(out + outPitch * y), (const u16_le *)unswizzled + bufw * y, w); } } else { for (int y = 0; y < h; ++y) { @@ -1565,7 +1565,7 @@ void TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureForm case GE_TFMT_DXT1: { int minw = std::min(bufw, w); - u32 *dst = (u32 *)out; + u32_le *dst = (u32_le *)out; int outPitch32 = 
outPitch / sizeof(u32); DXT1Block *src = (DXT1Block*)texptr; @@ -1587,7 +1587,7 @@ void TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureForm case GE_TFMT_DXT3: { int minw = std::min(bufw, w); - u32 *dst = (u32 *)out; + u32_le *dst = (u32_le *)out; int outPitch32 = outPitch / sizeof(u32); DXT3Block *src = (DXT3Block*)texptr; @@ -1609,7 +1609,7 @@ void TextureCacheCommon::DecodeTextureLevel(u8 *out, int outPitch, GETextureForm case GE_TFMT_DXT5: { int minw = std::min(bufw, w); - u32 *dst = (u32 *)out; + u32_le *dst = (u32_le *)out; int outPitch32 = outPitch / sizeof(u32); DXT5Block *src = (DXT5Block*)texptr; @@ -1646,8 +1646,8 @@ void TextureCacheCommon::ReadIndexedTex(u8 *out, int outPitch, int level, const int palFormat = gstate.getClutPaletteFormat(); - const u16 *clut16 = (const u16 *)clutBuf_; - const u32 *clut32 = (const u32 *)clutBuf_; + const u16_le *clut16 = (const u16_le *)clutBuf_; + const u32_le *clut32 = (const u32_le *)clutBuf_; if (expandTo32Bit && palFormat != GE_CMODE_32BIT_ABGR8888) { ConvertFormatToRGBA8888(GEPaletteFormat(palFormat), expandClut_, clut16, 256); @@ -1663,19 +1663,19 @@ void TextureCacheCommon::ReadIndexedTex(u8 *out, int outPitch, int level, const switch (bytesPerIndex) { case 1: for (int y = 0; y < h; ++y) { - DeIndexTexture((u16 *)(out + outPitch * y), (const u8 *)texptr + bufw * y, w, clut16); + DeIndexTexture((u16_le *)(out + outPitch * y), (const u8 *)texptr + bufw * y, w, clut16); } break; case 2: for (int y = 0; y < h; ++y) { - DeIndexTexture((u16 *)(out + outPitch * y), (const u16_le *)texptr + bufw * y, w, clut16); + DeIndexTexture((u16_le *)(out + outPitch * y), (const u16_le *)texptr + bufw * y, w, clut16); } break; case 4: for (int y = 0; y < h; ++y) { - DeIndexTexture((u16 *)(out + outPitch * y), (const u32_le *)texptr + bufw * y, w, clut16); + DeIndexTexture((u16_le *)(out + outPitch * y), (const u32_le *)texptr + bufw * y, w, clut16); } break; } @@ -1687,19 +1687,19 @@ void 
TextureCacheCommon::ReadIndexedTex(u8 *out, int outPitch, int level, const switch (bytesPerIndex) { case 1: for (int y = 0; y < h; ++y) { - DeIndexTexture((u32 *)(out + outPitch * y), (const u8 *)texptr + bufw * y, w, clut32); + DeIndexTexture((u32_le *)(out + outPitch * y), (const u8 *)texptr + bufw * y, w, clut32); } break; case 2: for (int y = 0; y < h; ++y) { - DeIndexTexture((u32 *)(out + outPitch * y), (const u16_le *)texptr + bufw * y, w, clut32); + DeIndexTexture((u32_le *)(out + outPitch * y), (const u16_le *)texptr + bufw * y, w, clut32); } break; case 4: for (int y = 0; y < h; ++y) { - DeIndexTexture((u32 *)(out + outPitch * y), (const u32_le *)texptr + bufw * y, w, clut32); + DeIndexTexture((u32_le *)(out + outPitch * y), (const u32_le *)texptr + bufw * y, w, clut32); } break; } diff --git a/GPU/Common/TextureCacheCommon.h b/GPU/Common/TextureCacheCommon.h index 630ef43fa9e5..cba3766291a3 100644 --- a/GPU/Common/TextureCacheCommon.h +++ b/GPU/Common/TextureCacheCommon.h @@ -348,10 +348,10 @@ class TextureCacheCommon { u32 clutHash_ = 0; // Raw is where we keep the original bytes. Converted is where we swap colors if necessary. - u32 *clutBufRaw_; - u32 *clutBufConverted_; + u32_le *clutBufRaw_; + u32_le *clutBufConverted_; // This is the active one. 
- u32 *clutBuf_; + u32_le *clutBuf_; u32 clutLastFormat_; u32 clutTotalBytes_; u32 clutMaxBytes_; @@ -370,7 +370,7 @@ class TextureCacheCommon { bool isBgraBackend_; - u32 expandClut_[256]; + u32_le expandClut_[256]; }; inline bool TexCacheEntry::Matches(u16 dim2, u8 format2, u8 maxLevel2) const { diff --git a/GPU/Common/TextureDecoder.cpp b/GPU/Common/TextureDecoder.cpp index f02094bfcd69..992c365b8e31 100644 --- a/GPU/Common/TextureDecoder.cpp +++ b/GPU/Common/TextureDecoder.cpp @@ -186,7 +186,7 @@ static u32 QuickTexHashBasic(const void *checkp, u32 size) { #else u32 check = 0; const u32 size_u32 = size / 4; - const u32 *p = (const u32 *)checkp; + const u32_le *p = (const u32_le *)checkp; for (u32 i = 0; i < size_u32; i += 4) { check += p[i + 0]; check ^= p[i + 1]; @@ -329,9 +329,9 @@ class DXTDecoder { public: inline void DecodeColors(const DXT1Block *src, bool ignore1bitAlpha); inline void DecodeAlphaDXT5(const DXT5Block *src); - inline void WriteColorsDXT1(u32 *dst, const DXT1Block *src, int pitch, int height); - inline void WriteColorsDXT3(u32 *dst, const DXT3Block *src, int pitch, int height); - inline void WriteColorsDXT5(u32 *dst, const DXT5Block *src, int pitch, int height); + inline void WriteColorsDXT1(u32_le *dst, const DXT1Block *src, int pitch, int height); + inline void WriteColorsDXT3(u32_le *dst, const DXT3Block *src, int pitch, int height); + inline void WriteColorsDXT5(u32_le *dst, const DXT5Block *src, int pitch, int height); protected: u32 colors_[4]; @@ -409,7 +409,7 @@ void DXTDecoder::DecodeAlphaDXT5(const DXT5Block *src) { } } -void DXTDecoder::WriteColorsDXT1(u32 *dst, const DXT1Block *src, int pitch, int height) { +void DXTDecoder::WriteColorsDXT1(u32_le *dst, const DXT1Block *src, int pitch, int height) { for (int y = 0; y < height; y++) { int colordata = src->lines[y]; for (int x = 0; x < 4; x++) { @@ -420,7 +420,7 @@ void DXTDecoder::WriteColorsDXT1(u32 *dst, const DXT1Block *src, int pitch, int } } -void 
DXTDecoder::WriteColorsDXT3(u32 *dst, const DXT3Block *src, int pitch, int height) { +void DXTDecoder::WriteColorsDXT3(u32_le *dst, const DXT3Block *src, int pitch, int height) { for (int y = 0; y < height; y++) { int colordata = src->color.lines[y]; u32 alphadata = src->alphaLines[y]; @@ -433,7 +433,7 @@ void DXTDecoder::WriteColorsDXT3(u32 *dst, const DXT3Block *src, int pitch, int } } -void DXTDecoder::WriteColorsDXT5(u32 *dst, const DXT5Block *src, int pitch, int height) { +void DXTDecoder::WriteColorsDXT5(u32_le *dst, const DXT5Block *src, int pitch, int height) { // 48 bits, 3 bit index per pixel, 12 bits per line. u64 alphadata = ((u64)(u16)src->alphadata1 << 32) | (u32)src->alphadata2; @@ -449,20 +449,20 @@ void DXTDecoder::WriteColorsDXT5(u32 *dst, const DXT5Block *src, int pitch, int } // This could probably be done faster by decoding two or four blocks at a time with SSE/NEON. -void DecodeDXT1Block(u32 *dst, const DXT1Block *src, int pitch, int height, bool ignore1bitAlpha) { +void DecodeDXT1Block(u32_le *dst, const DXT1Block *src, int pitch, int height, bool ignore1bitAlpha) { DXTDecoder dxt; dxt.DecodeColors(src, ignore1bitAlpha); dxt.WriteColorsDXT1(dst, src, pitch, height); } -void DecodeDXT3Block(u32 *dst, const DXT3Block *src, int pitch, int height) { +void DecodeDXT3Block(u32_le *dst, const DXT3Block *src, int pitch, int height) { DXTDecoder dxt; dxt.DecodeColors(&src->color, true); dxt.WriteColorsDXT3(dst, src, pitch, height); } // The alpha channel is not 100% correct -void DecodeDXT5Block(u32 *dst, const DXT5Block *src, int pitch, int height) { +void DecodeDXT5Block(u32_le *dst, const DXT5Block *src, int pitch, int height) { DXTDecoder dxt; dxt.DecodeColors(&src->color, true); dxt.DecodeAlphaDXT5(src); @@ -603,7 +603,7 @@ CheckAlphaResult CheckAlphaRGBA5551SSE2(const u32 *pixelData, int stride, int w, } #endif -CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w, int h) { +CheckAlphaResult 
CheckAlphaRGBA8888Basic(const u32_le *pixelData, int stride, int w, int h) { // Use SIMD if aligned to 16 bytes / 4 pixels (almost always the case.) if ((w & 3) == 0 && (stride & 3) == 0) { #ifdef _M_SSE @@ -615,7 +615,7 @@ CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w #endif } - const u32 *p = pixelData; + const u32_le *p = (u32_le *)pixelData; for (int y = 0; y < h; ++y) { u32 bits = 0xFF000000; for (int i = 0; i < w; ++i) { @@ -633,7 +633,7 @@ CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w return CHECKALPHA_FULL; } -CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w, int h) { +CheckAlphaResult CheckAlphaABGR4444Basic(const u32_le *pixelData, int stride, int w, int h) { // Use SIMD if aligned to 16 bytes / 8 pixels (usually the case.) if ((w & 7) == 0 && (stride & 7) == 0) { #ifdef _M_SSE @@ -645,7 +645,7 @@ CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w #endif } - const u32 *p = pixelData; + const u32_le *p = (u32_le *)pixelData; const int w2 = (w + 1) / 2; const int stride2 = (stride + 1) / 2; @@ -666,7 +666,7 @@ CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w return CHECKALPHA_FULL; } -CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w, int h) { +CheckAlphaResult CheckAlphaABGR1555Basic(const u32_le *pixelData, int stride, int w, int h) { // Use SIMD if aligned to 16 bytes / 8 pixels (usually the case.) 
if ((w & 7) == 0 && (stride & 7) == 0) { #ifdef _M_SSE @@ -678,7 +678,7 @@ CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w #endif } - const u32 *p = pixelData; + const u32_le *p = (u32_le *)pixelData; const int w2 = (w + 1) / 2; const int stride2 = (stride + 1) / 2; @@ -698,7 +698,7 @@ CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w return CHECKALPHA_FULL; } -CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w, int h) { +CheckAlphaResult CheckAlphaRGBA4444Basic(const u32_le *pixelData, int stride, int w, int h) { // Use SSE if aligned to 16 bytes / 8 pixels (usually the case.) if ((w & 7) == 0 && (stride & 7) == 0) { #ifdef _M_SSE @@ -710,7 +710,7 @@ CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w #endif } - const u32 *p = pixelData; + const u32_le *p = (u32_le*)pixelData; const int w2 = (w + 1) / 2; const int stride2 = (stride + 1) / 2; @@ -731,7 +731,7 @@ CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w return CHECKALPHA_FULL; } -CheckAlphaResult CheckAlphaRGBA5551Basic(const u32 *pixelData, int stride, int w, int h) { +CheckAlphaResult CheckAlphaRGBA5551Basic(const u32_le *pixelData, int stride, int w, int h) { // Use SSE if aligned to 16 bytes / 8 pixels (usually the case.) 
if ((w & 7) == 0 && (stride & 7) == 0) { #ifdef _M_SSE @@ -743,7 +743,7 @@ CheckAlphaResult CheckAlphaRGBA5551Basic(const u32 *pixelData, int stride, int w #endif } - const u32 *p = pixelData; + const u32_le *p = (u32_le*)pixelData; const int w2 = (w + 1) / 2; const int stride2 = (stride + 1) / 2; diff --git a/GPU/Common/TextureDecoder.h b/GPU/Common/TextureDecoder.h index 02e7939cdc21..34ca8cdea2b5 100644 --- a/GPU/Common/TextureDecoder.h +++ b/GPU/Common/TextureDecoder.h @@ -60,11 +60,11 @@ typedef void (*UnswizzleTex16Func)(const u8 *texptr, u32 *ydestp, int bxc, int b extern UnswizzleTex16Func DoUnswizzleTex16; #endif -CheckAlphaResult CheckAlphaRGBA8888Basic(const u32 *pixelData, int stride, int w, int h); -CheckAlphaResult CheckAlphaABGR4444Basic(const u32 *pixelData, int stride, int w, int h); -CheckAlphaResult CheckAlphaRGBA4444Basic(const u32 *pixelData, int stride, int w, int h); -CheckAlphaResult CheckAlphaABGR1555Basic(const u32 *pixelData, int stride, int w, int h); -CheckAlphaResult CheckAlphaRGBA5551Basic(const u32 *pixelData, int stride, int w, int h); +CheckAlphaResult CheckAlphaRGBA8888Basic(const u32_le *pixelData, int stride, int w, int h); +CheckAlphaResult CheckAlphaABGR4444Basic(const u32_le *pixelData, int stride, int w, int h); +CheckAlphaResult CheckAlphaRGBA4444Basic(const u32_le *pixelData, int stride, int w, int h); +CheckAlphaResult CheckAlphaABGR1555Basic(const u32_le *pixelData, int stride, int w, int h); +CheckAlphaResult CheckAlphaRGBA5551Basic(const u32_le *pixelData, int stride, int w, int h); // All these DXT structs are in the reverse order, as compared to PC. // On PC, alpha comes before color, and interpolants are before the tile data. 
@@ -87,9 +87,9 @@ struct DXT5Block { u8 alpha1; u8 alpha2; }; -void DecodeDXT1Block(u32 *dst, const DXT1Block *src, int pitch, int height, bool ignore1bitAlpha); -void DecodeDXT3Block(u32 *dst, const DXT3Block *src, int pitch, int height); -void DecodeDXT5Block(u32 *dst, const DXT5Block *src, int pitch, int height); +void DecodeDXT1Block(u32_le *dst, const DXT1Block *src, int pitch, int height, bool ignore1bitAlpha); +void DecodeDXT3Block(u32_le *dst, const DXT3Block *src, int pitch, int height); +void DecodeDXT5Block(u32_le *dst, const DXT5Block *src, int pitch, int height); static const u8 textureBitsPerPixel[16] = { 16, //GE_TFMT_5650, @@ -160,35 +160,23 @@ inline void DeIndexTexture4(ClutT *dest, const u8 *indexed, int length, const Cl } } -template -inline void DeIndexTexture4Optimal(ClutT *dest, const u8 *indexed, int length, ClutT color) { - for (int i = 0; i < length; i += 2) { - u8 index = *indexed++; - dest[i + 0] = color | ((index >> 0) & 0xf); - dest[i + 1] = color | ((index >> 4) & 0xf); - } -} - -template <> -inline void DeIndexTexture4Optimal(u16 *dest, const u8 *indexed, int length, u16 color) { +inline void DeIndexTexture4Optimal(u16_le *dest, const u8 *indexed, int length, u16 color) { const u16_le *indexed16 = (const u16_le *)indexed; - const u32 color32 = (color << 16) | color; - u32 *dest32 = (u32 *)dest; - for (int i = 0; i < length / 2; i += 2) { - u16 index = *indexed16++; - dest32[i + 0] = color32 | ((index & 0x00f0) << 12) | ((index & 0x000f) >> 0); - dest32[i + 1] = color32 | ((index & 0xf000) << 4) | ((index & 0x0f00) >> 8); + const u64 color64 = ((u64)color << 48) | ((u64)color << 32) | ((u64)color << 16) | (u64)color; + u64_le *dest64 = (u64_le *)dest; + for (int i = 0; i < length / 4; i ++) { + u64 index = *indexed16++; + dest64[i] = color64 | ((index & 0xf000) << 36) | ((index & 0x0f00) << 24) | ((index & 0x00f0) << 12) | ((index & 0x000f) << 0); } } -inline void DeIndexTexture4OptimalRev(u16 *dest, const u8 *indexed, int length, u16 
color) { +inline void DeIndexTexture4OptimalRev(u16_le *dest, const u8 *indexed, int length, u16 color) { const u16_le *indexed16 = (const u16_le *)indexed; - const u32 color32 = (color << 16) | color; - u32 *dest32 = (u32 *)dest; - for (int i = 0; i < length / 2; i += 2) { - u16 index = *indexed16++; - dest32[i + 0] = color32 | ((index & 0x00f0) << 24) | ((index & 0x000f) << 12); - dest32[i + 1] = color32 | ((index & 0xf000) << 16) | ((index & 0x0f00) << 4); + const u64 color64 = ((u64)color << 48) | ((u64)color << 32) | ((u64)color << 16) | (u64)color; + u64_le *dest64 = (u64_le *)dest; + for (int i = 0; i < length / 4; i ++) { + u64 index = *indexed16++; + dest64[i] = color64 | ((index & 0xf000) << 48) | ((index & 0x0f00) << 36) | ((index & 0x00f0) << 24) | ((index & 0x000f) << 12); } } @@ -198,8 +186,7 @@ inline void DeIndexTexture4(ClutT *dest, const u32 texaddr, int length, const Cl DeIndexTexture4(dest, indexed, length, clut); } -template -inline void DeIndexTexture4Optimal(ClutT *dest, const u32 texaddr, int length, ClutT color) { +inline void DeIndexTexture4Optimal(u16_le *dest, const u32 texaddr, int length, u16 color) { const u8 *indexed = (const u8 *) Memory::GetPointer(texaddr); DeIndexTexture4Optimal(dest, indexed, length, color); } diff --git a/GPU/Common/VertexDecoderCommon.cpp b/GPU/Common/VertexDecoderCommon.cpp index 049110546591..d11dd1e489d9 100644 --- a/GPU/Common/VertexDecoderCommon.cpp +++ b/GPU/Common/VertexDecoderCommon.cpp @@ -119,7 +119,7 @@ void GetIndexBounds(const void *inds, int count, u32 vertType, u16 *indexLowerBo lowerBound = value; } } else if (idx == GE_VTYPE_IDX_16BIT) { - const u16 *ind16 = (const u16 *)inds; + const u16_le *ind16 = (const u16_le *)inds; for (int i = 0; i < count; i++) { u16 value = ind16[i]; if (value > upperBound) @@ -129,7 +129,7 @@ void GetIndexBounds(const void *inds, int count, u32 vertType, u16 *indexLowerBo } } else if (idx == GE_VTYPE_IDX_32BIT) { WARN_LOG_REPORT_ONCE(indexBounds32, G3D, 
"GetIndexBounds: Decoding 32-bit indexes"); - const u32 *ind32 = (const u32 *)inds; + const u32_le *ind32 = (const u32_le *)inds; for (int i = 0; i < count; i++) { u16 value = (u16)ind32[i]; // These aren't documented and should be rare. Let's bounds check each one. @@ -193,7 +193,7 @@ void VertexDecoder::Step_WeightsU8() const void VertexDecoder::Step_WeightsU16() const { u16 *wt = (u16 *)(decoded_ + decFmt.w0off); - const u16 *wdata = (const u16*)(ptr_); + const u16_le *wdata = (const u16_le*)(ptr_); int j; for (j = 0; j < nweights; j++) wt[j] = wdata[j]; @@ -216,7 +216,7 @@ void VertexDecoder::Step_WeightsU8ToFloat() const void VertexDecoder::Step_WeightsU16ToFloat() const { float *wt = (float *)(decoded_ + decFmt.w0off); - const u16 *wdata = (const u16*)(ptr_); + const u16_le *wdata = (const u16_le*)(ptr_); int j; for (j = 0; j < nweights; j++) { wt[j] = (float)wdata[j] * (1.0f / 32768.0f); @@ -231,7 +231,7 @@ void VertexDecoder::Step_WeightsU16ToFloat() const void VertexDecoder::Step_WeightsFloat() const { float *wt = (float *)(decoded_ + decFmt.w0off); - const float *wdata = (const float*)(ptr_); + const float_le *wdata = (const float_le*)(ptr_); int j; for (j = 0; j < nweights; j++) { wt[j] = wdata[j]; @@ -261,7 +261,7 @@ void VertexDecoder::Step_WeightsU8Skin() const { } void VertexDecoder::Step_WeightsU16Skin() const { - const u16 *wdata = (const u16*)(ptr_); + const u16_le *wdata = (const u16_le*)(ptr_); float weights[8]; for (int j = 0; j < nweights; j++) weights[j] = wdata[j] * (1.0f / 32768.0f); @@ -269,8 +269,11 @@ void VertexDecoder::Step_WeightsU16Skin() const { } void VertexDecoder::Step_WeightsFloatSkin() const { - const float *wdata = (const float*)(ptr_); - ComputeSkinMatrix(wdata); + const float_le *wdata = (const float_le*)(ptr_); + float weights[8]; + for (int j = 0; j < nweights; j++) + weights[j] = wdata[j]; + ComputeSkinMatrix(weights); } void VertexDecoder::Step_TcU8ToFloat() const @@ -285,7 +288,7 @@ void 
VertexDecoder::Step_TcU8ToFloat() const void VertexDecoder::Step_TcU16ToFloat() const { float *uv = (float *)(decoded_ + decFmt.uvoff); - const u16 *uvdata = (const u16_le*)(ptr_ + tcoff); + const u16_le *uvdata = (const u16_le*)(ptr_ + tcoff); uv[0] = uvdata[0] * (1.0f / 32768.0f); uv[1] = uvdata[1] * (1.0f / 32768.0f); } @@ -293,7 +296,7 @@ void VertexDecoder::Step_TcU16ToFloat() const void VertexDecoder::Step_TcU16DoubleToFloat() const { float *uv = (float*)(decoded_ + decFmt.uvoff); - const u16 *uvdata = (const u16_le*)(ptr_ + tcoff); + const u16_le *uvdata = (const u16_le*)(ptr_ + tcoff); uv[0] = uvdata[0] * (1.0f / 16384.0f); uv[1] = uvdata[1] * (1.0f / 16384.0f); } @@ -301,20 +304,20 @@ void VertexDecoder::Step_TcU16DoubleToFloat() const void VertexDecoder::Step_TcU16ThroughToFloat() const { float *uv = (float *)(decoded_ + decFmt.uvoff); - const u16 *uvdata = (const u16_le*)(ptr_ + tcoff); + const u16_le *uvdata = (const u16_le*)(ptr_ + tcoff); uv[0] = uvdata[0]; uv[1] = uvdata[1]; - gstate_c.vertBounds.minU = std::min(gstate_c.vertBounds.minU, uvdata[0]); - gstate_c.vertBounds.maxU = std::max(gstate_c.vertBounds.maxU, uvdata[0]); - gstate_c.vertBounds.minV = std::min(gstate_c.vertBounds.minV, uvdata[1]); - gstate_c.vertBounds.maxV = std::max(gstate_c.vertBounds.maxV, uvdata[1]); + gstate_c.vertBounds.minU = std::min(gstate_c.vertBounds.minU, (u16)uvdata[0]); + gstate_c.vertBounds.maxU = std::max(gstate_c.vertBounds.maxU, (u16)uvdata[0]); + gstate_c.vertBounds.minV = std::min(gstate_c.vertBounds.minV, (u16)uvdata[1]); + gstate_c.vertBounds.maxV = std::max(gstate_c.vertBounds.maxV, (u16)uvdata[1]); } void VertexDecoder::Step_TcU16ThroughDoubleToFloat() const { float *uv = (float *)(decoded_ + decFmt.uvoff); - const u16 *uvdata = (const u16_le*)(ptr_ + tcoff); + const u16_le *uvdata = (const u16_le*)(ptr_ + tcoff); uv[0] = uvdata[0] * 2; uv[1] = uvdata[1] * 2; } @@ -322,7 +325,7 @@ void VertexDecoder::Step_TcU16ThroughDoubleToFloat() const void 
VertexDecoder::Step_TcFloat() const { float *uv = (float *)(decoded_ + decFmt.uvoff); - const float *uvdata = (const float*)(ptr_ + tcoff); + const float_le *uvdata = (const float_le*)(ptr_ + tcoff); uv[0] = uvdata[0]; uv[1] = uvdata[1]; } @@ -330,7 +333,7 @@ void VertexDecoder::Step_TcFloat() const void VertexDecoder::Step_TcFloatThrough() const { float *uv = (float *)(decoded_ + decFmt.uvoff); - const float *uvdata = (const float*)(ptr_ + tcoff); + const float_le *uvdata = (const float_le*)(ptr_ + tcoff); uv[0] = uvdata[0]; uv[1] = uvdata[1]; @@ -363,7 +366,7 @@ void VertexDecoder::Step_TcU16DoublePrescale() const { void VertexDecoder::Step_TcFloatPrescale() const { float *uv = (float *)(decoded_ + decFmt.uvoff); - const float *uvdata = (const float*)(ptr_ + tcoff); + const float_le *uvdata = (const float_le*)(ptr_ + tcoff); uv[0] = uvdata[0] * gstate_c.uv.uScale + gstate_c.uv.uOff; uv[1] = uvdata[1] * gstate_c.uv.vScale + gstate_c.uv.vOff; } @@ -495,41 +498,42 @@ void VertexDecoder::Step_ColorInvalid() const void VertexDecoder::Step_Color565() const { - u8 *c = decoded_ + decFmt.c0off; + u32 *c = (u32*)(decoded_ + decFmt.c0off); u16 cdata = *(u16_le *)(ptr_ + coloff); - c[0] = Convert5To8(cdata & 0x1f); - c[1] = Convert6To8((cdata >> 5) & 0x3f); - c[2] = Convert5To8((cdata >> 11) & 0x1f); - c[3] = 255; + *c = Convert5To8(cdata & 0x1f); + *c |= Convert6To8((cdata >> 5) & 0x3f) << 8; + *c |= Convert5To8((cdata >> 11) & 0x1f) << 16; + *c |= 255 << 24; // Always full alpha. } void VertexDecoder::Step_Color5551() const { - u8 *c = decoded_ + decFmt.c0off; + u32 *c = (u32*)(decoded_ + decFmt.c0off); u16 cdata = *(u16_le *)(ptr_ + coloff); gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && (cdata >> 15) != 0; - c[0] = Convert5To8(cdata & 0x1f); - c[1] = Convert5To8((cdata >> 5) & 0x1f); - c[2] = Convert5To8((cdata >> 10) & 0x1f); - c[3] = (cdata >> 15) ? 
255 : 0; + *c = Convert5To8(cdata & 0x1f); + *c |= Convert5To8((cdata >> 5) & 0x1f) << 8; + *c |= Convert5To8((cdata >> 10) & 0x1f) << 16; + *c |= (cdata >> 15) ? 255 << 24 : 0; } void VertexDecoder::Step_Color4444() const { - u8 *c = decoded_ + decFmt.c0off; + u32 *c = (u32*)(decoded_ + decFmt.c0off); u16 cdata = *(u16_le *)(ptr_ + coloff); gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && (cdata >> 12) == 0xF; + *c = 0; for (int j = 0; j < 4; j++) - c[j] = Convert4To8((cdata >> (j * 4)) & 0xF); + *c |= Convert4To8((cdata >> (j * 4)) & 0xF) << (j * 8); } void VertexDecoder::Step_Color8888() const { - u8 *c = decoded_ + decFmt.c0off; - const u8 *cdata = (const u8*)(ptr_ + coloff); - gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && cdata[3] == 255; - memcpy(c, cdata, sizeof(u8) * 4); + u32 *c = (u32*)(decoded_ + decFmt.c0off); + u32 cdata = *(u32_le*)(ptr_ + coloff); + gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && (cdata >> 24) == 0xFF; + *c = cdata; } void VertexDecoder::Step_Color565Morph() const @@ -621,7 +625,7 @@ void VertexDecoder::Step_NormalS8ToFloat() const void VertexDecoder::Step_NormalS16() const { s16 *normal = (s16 *)(decoded_ + decFmt.nrmoff); - const s16 *sv = (const s16_le*)(ptr_ + nrmoff); + const s16_le *sv = (const s16_le*)(ptr_ + nrmoff); for (int j = 0; j < 3; j++) normal[j] = sv[j]; normal[3] = 0; @@ -630,7 +634,7 @@ void VertexDecoder::Step_NormalS16() const void VertexDecoder::Step_NormalFloat() const { u32 *normal = (u32 *)(decoded_ + decFmt.nrmoff); - const u32 *fv = (const u32_le*)(ptr_ + nrmoff); + const u32_le *fv = (const u32_le*)(ptr_ + nrmoff); for (int j = 0; j < 3; j++) normal[j] = fv[j]; } @@ -646,15 +650,16 @@ void VertexDecoder::Step_NormalS8Skin() const void VertexDecoder::Step_NormalS16Skin() const { float *normal = (float *)(decoded_ + decFmt.nrmoff); - const s16 *sv = (const s16_le*)(ptr_ + nrmoff); - const float fn[3] = { sv[0] * (1.0f / 32768.0f), sv[1] * (1.0f / 32768.0f), sv[2] * (1.0f / 32768.0f) }; 
+ const s16_le *sv = (const s16_le*)(ptr_ + nrmoff); + const float fn[3] = { (float)sv[0] * (1.0f / 32768.0f), (float)sv[1] * (1.0f / 32768.0f), (float)sv[2] * (1.0f / 32768.0f) }; Norm3ByMatrix43(normal, fn, skinMatrix); } void VertexDecoder::Step_NormalFloatSkin() const { float *normal = (float *)(decoded_ + decFmt.nrmoff); - const float *fn = (const float *)(ptr_ + nrmoff); + const float_le *sv = (const float_le*)(ptr_ + nrmoff); + const float fn[3] = { sv[0], sv[1], sv[2] }; Norm3ByMatrix43(normal, fn, skinMatrix); } @@ -675,7 +680,7 @@ void VertexDecoder::Step_NormalS16Morph() const float *normal = (float *)(decoded_ + decFmt.nrmoff); memset(normal, 0, sizeof(float) * 3); for (int n = 0; n < morphcount; n++) { - const s16 *sv = (const s16_le *)(ptr_ + onesize_*n + nrmoff); + const s16_le *sv = (const s16_le *)(ptr_ + onesize_*n + nrmoff); const float multiplier = gstate_c.morphWeights[n] * (1.0f / 32768.0f); for (int j = 0; j < 3; j++) normal[j] += sv[j] * multiplier; @@ -688,7 +693,7 @@ void VertexDecoder::Step_NormalFloatMorph() const memset(normal, 0, sizeof(float) * 3); for (int n = 0; n < morphcount; n++) { float multiplier = gstate_c.morphWeights[n]; - const float *fv = (const float*)(ptr_ + onesize_*n + nrmoff); + const float_le *fv = (const float_le*)(ptr_ + onesize_*n + nrmoff); for (int j = 0; j < 3; j++) normal[j] += fv[j] * multiplier; } @@ -710,7 +715,7 @@ void VertexDecoder::Step_NormalS16MorphSkin() const { float *normal = (float *)(decoded_ + decFmt.nrmoff); float nrm[3]{}; for (int n = 0; n < morphcount; n++) { - const s16 *sv = (const s16_le *)(ptr_ + onesize_ * n + nrmoff); + const s16_le *sv = (const s16_le *)(ptr_ + onesize_ * n + nrmoff); const float multiplier = gstate_c.morphWeights[n] * (1.0f / 32768.0f); for (int j = 0; j < 3; j++) nrm[j] += sv[j] * multiplier; @@ -723,7 +728,7 @@ void VertexDecoder::Step_NormalFloatMorphSkin() const { float nrm[3]{}; for (int n = 0; n < morphcount; n++) { float multiplier = gstate_c.morphWeights[n]; 
- const float *fv = (const float*)(ptr_ + onesize_ * n + nrmoff); + const float_le *fv = (const float_le*)(ptr_ + onesize_ * n + nrmoff); for (int j = 0; j < 3; j++) nrm[j] += fv[j] * multiplier; } @@ -741,16 +746,17 @@ void VertexDecoder::Step_PosS8() const void VertexDecoder::Step_PosS16() const { float *pos = (float *)(decoded_ + decFmt.posoff); - const s16 *sv = (const s16_le *)(ptr_ + posoff); + const s16_le *sv = (const s16_le *)(ptr_ + posoff); for (int j = 0; j < 3; j++) pos[j] = sv[j] * (1.0f / 32768.0f); } void VertexDecoder::Step_PosFloat() const { - u8 *v = (u8 *)(decoded_ + decFmt.posoff); - const u8 *fv = (const u8*)(ptr_ + posoff); - memcpy(v, fv, 12); + float *pos = (float *)(decoded_ + decFmt.posoff); + const float_le *fv = (const float_le *)(ptr_ + posoff); + for (int j = 0; j < 3; j++) + pos[j] = fv[j]; } void VertexDecoder::Step_PosS8Skin() const @@ -765,14 +771,15 @@ void VertexDecoder::Step_PosS16Skin() const { float *pos = (float *)(decoded_ + decFmt.posoff); const s16_le *sv = (const s16_le *)(ptr_ + posoff); - const float fn[3] = { sv[0] * (1.0f / 32768.0f), sv[1] * (1.0f / 32768.0f), sv[2] * (1.0f / 32768.0f) }; + const float fn[3] = { (float)sv[0] * (1.0f / 32768.0f), (float)sv[1] * (1.0f / 32768.0f), (float)sv[2] * (1.0f / 32768.0f) }; Vec3ByMatrix43(pos, fn, skinMatrix); } void VertexDecoder::Step_PosFloatSkin() const { float *pos = (float *)(decoded_ + decFmt.posoff); - const float *fn = (const float *)(ptr_ + posoff); + const float_le *sv = (const float_le*)(ptr_ + posoff); + const float fn[3] = { sv[0], sv[1], sv[2] }; Vec3ByMatrix43(pos, fn, skinMatrix); } @@ -797,9 +804,11 @@ void VertexDecoder::Step_PosS16Through() const void VertexDecoder::Step_PosFloatThrough() const { - u8 *v = (u8 *)(decoded_ + decFmt.posoff); - const u8 *fv = (const u8 *)(ptr_ + posoff); - memcpy(v, fv, 12); + float *v = (float *)(decoded_ + decFmt.posoff); + const float_le *fv = (const float_le*)(ptr_ + posoff); + v[0] = fv[0]; + v[1] = fv[1]; + v[2] = 
fv[2]; } void VertexDecoder::Step_PosS8Morph() const @@ -820,7 +829,7 @@ void VertexDecoder::Step_PosS16Morph() const memset(v, 0, sizeof(float) * 3); for (int n = 0; n < morphcount; n++) { const float multiplier = 1.0f / 32768.0f; - const s16 *sv = (const s16*)(ptr_ + onesize_*n + posoff); + const s16_le *sv = (const s16_le*)(ptr_ + onesize_*n + posoff); for (int j = 0; j < 3; j++) v[j] += (float)sv[j] * (multiplier * gstate_c.morphWeights[n]); } @@ -831,7 +840,7 @@ void VertexDecoder::Step_PosFloatMorph() const float *v = (float *)(decoded_ + decFmt.posoff); memset(v, 0, sizeof(float) * 3); for (int n = 0; n < morphcount; n++) { - const float *fv = (const float*)(ptr_ + onesize_*n + posoff); + const float_le *fv = (const float_le*)(ptr_ + onesize_*n + posoff); for (int j = 0; j < 3; j++) v[j] += fv[j] * gstate_c.morphWeights[n]; } @@ -854,7 +863,7 @@ void VertexDecoder::Step_PosS16MorphSkin() const { float pos[3]{}; for (int n = 0; n < morphcount; n++) { const float multiplier = 1.0f / 32768.0f; - const s16 *sv = (const s16*)(ptr_ + onesize_ * n + posoff); + const s16_le *sv = (const s16_le*)(ptr_ + onesize_ * n + posoff); for (int j = 0; j < 3; j++) pos[j] += (float)sv[j] * (multiplier * gstate_c.morphWeights[n]); } @@ -865,7 +874,7 @@ void VertexDecoder::Step_PosFloatMorphSkin() const { float *v = (float *)(decoded_ + decFmt.posoff); float pos[3]{}; for (int n = 0; n < morphcount; n++) { - const float *fv = (const float*)(ptr_ + onesize_ * n + posoff); + const float_le *fv = (const float_le*)(ptr_ + onesize_ * n + posoff); for (int j = 0; j < 3; j++) pos[j] += fv[j] * gstate_c.morphWeights[n]; } @@ -1352,6 +1361,8 @@ std::string VertexDecoder::GetString(DebugShaderStringType stringType) { lines = DisassembleArm2((const u8 *)jitted_, jittedSize_); #elif PPSSPP_ARCH(MIPS) || PPSSPP_ARCH(MIPS64) // No MIPS disassembler defined +#elif defined(__wiiu__) + lines = DisassemblePPC((const u8 *)jitted_, jittedSize_); #else lines = DisassembleX86((const u8 *)jitted_, 
jittedSize_); #endif diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h index 333f2b1c3678..b27b2652982b 100644 --- a/GPU/Common/VertexDecoderCommon.h +++ b/GPU/Common/VertexDecoderCommon.h @@ -37,10 +37,10 @@ #include "Common/x64Emitter.h" #elif PPSSPP_ARCH(MIPS) #include "Common/MipsEmitter.h" -#else -#include "Common/FakeEmitter.h" #endif +#include "Common/FakeEmitter.h" + // DecVtxFormat - vertex formats for PC // Kind of like a D3D VertexDeclaration. // Can write code to easily bind these using OpenGL, or read these manually. @@ -93,8 +93,8 @@ class IndexConverter { union { const void *indices; const u8 *indices8; - const u16 *indices16; - const u32 *indices32; + const u16_le *indices16; + const u32_le *indices32; }; u32 indexType; diff --git a/GPU/Common/VertexDecoderFake.cpp b/GPU/Common/VertexDecoderFake.cpp index af27e695612e..dfccd0932a2c 100644 --- a/GPU/Common/VertexDecoderFake.cpp +++ b/GPU/Common/VertexDecoderFake.cpp @@ -15,8 +15,6 @@ // Official git repository and contact information can be found at // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. -//TODO: Doesn't build, FIXME! 
-#if 0 #include "Common/CPUDetect.h" #include "Core/Config.h" @@ -43,18 +41,13 @@ static const JitLookup jitLookup[] = { {&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin}, {&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin}, - {&VertexDecoder::Step_TcU8, &VertexDecoderJitCache::Jit_TcU8}, - {&VertexDecoder::Step_TcU16, &VertexDecoderJitCache::Jit_TcU16}, {&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat}, - {&VertexDecoder::Step_TcU16Double, &VertexDecoderJitCache::Jit_TcU16Double}, {&VertexDecoder::Step_TcU8Prescale, &VertexDecoderJitCache::Jit_TcU8Prescale}, {&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale}, {&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale}, - {&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through}, {&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough}, - {&VertexDecoder::Step_TcU16ThroughDouble, &VertexDecoderJitCache::Jit_TcU16ThroughDouble}, {&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8}, {&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16}, @@ -126,27 +119,12 @@ void VertexDecoderJitCache::Jit_WeightsU16Skin() { void VertexDecoderJitCache::Jit_WeightsFloatSkin() { } -void VertexDecoderJitCache::Jit_TcU8() { -} - -void VertexDecoderJitCache::Jit_TcU16() { -} - void VertexDecoderJitCache::Jit_TcFloat() { } -void VertexDecoderJitCache::Jit_TcU16Through() { -} - void VertexDecoderJitCache::Jit_TcFloatThrough() { } -void VertexDecoderJitCache::Jit_TcU16Double() { -} - -void VertexDecoderJitCache::Jit_TcU16ThroughDouble() { -} - void VertexDecoderJitCache::Jit_TcU8Prescale() { } @@ -271,5 +249,3 @@ void VertexDecoderJitCache::Jit_NormalFloatMorph() { bool VertexDecoderJitCache::CompileStep(const VertexDecoder &dec, int step) { return false; } - -#endif diff --git a/GPU/D3D11/TextureScalerD3D11.cpp 
b/GPU/D3D11/TextureScalerD3D11.cpp index 32c06c45ee56..3dbfd6b7b067 100644 --- a/GPU/D3D11/TextureScalerD3D11.cpp +++ b/GPU/D3D11/TextureScalerD3D11.cpp @@ -39,19 +39,19 @@ void TextureScalerD3D11::ConvertTo8888(u32 format, u32* source, u32* &dest, int break; case DXGI_FORMAT_B4G4R4A4_UNORM: - GlobalThreadPool::Loop(std::bind(&convert4444_dx9, (u16*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height); + GlobalThreadPool::Loop(std::bind(&convert4444_dx9, (u16_le*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height); break; case DXGI_FORMAT_B5G6R5_UNORM: - GlobalThreadPool::Loop(std::bind(&convert565_dx9, (u16*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height); + GlobalThreadPool::Loop(std::bind(&convert565_dx9, (u16_le*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height); break; case DXGI_FORMAT_B5G5R5A1_UNORM: - GlobalThreadPool::Loop(std::bind(&convert5551_dx9, (u16*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height); + GlobalThreadPool::Loop(std::bind(&convert5551_dx9, (u16_le*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height); break; default: dest = source; ERROR_LOG(G3D, "iXBRZTexScaling: unsupported texture format"); } -} \ No newline at end of file +} diff --git a/GPU/Debugger/Playback.cpp b/GPU/Debugger/Playback.cpp index 6a4e2c2e97da..61cf5ae4ad67 100644 --- a/GPU/Debugger/Playback.cpp +++ b/GPU/Debugger/Playback.cpp @@ -304,7 +304,7 @@ class DumpExecute { u32 execListPos = 0; u32 execListID = 0; const int LIST_BUF_SIZE = 256 * 1024; - std::vector execListQueue; + std::vector execListQueue; u16 lastBufw_[8]{}; const std::vector &pushbuf_; diff --git a/GPU/Directx9/TextureScalerDX9.cpp b/GPU/Directx9/TextureScalerDX9.cpp index a31dbd3af4fa..8fb32212a739 100644 --- a/GPU/Directx9/TextureScalerDX9.cpp +++ b/GPU/Directx9/TextureScalerDX9.cpp @@ -47,15 +47,15 @@ void 
TextureScalerDX9::ConvertTo8888(u32 format, u32* source, u32* &dest, int wi break; case D3DFMT_A4R4G4B4: - GlobalThreadPool::Loop(std::bind(&convert4444_dx9, (u16*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height); + GlobalThreadPool::Loop(std::bind(&convert4444_dx9, (u16_le*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height); break; case D3DFMT_R5G6B5: - GlobalThreadPool::Loop(std::bind(&convert565_dx9, (u16*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height); + GlobalThreadPool::Loop(std::bind(&convert565_dx9, (u16_le*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height); break; case D3DFMT_A1R5G5B5: - GlobalThreadPool::Loop(std::bind(&convert5551_dx9, (u16*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height); + GlobalThreadPool::Loop(std::bind(&convert5551_dx9, (u16_le*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height); break; default: diff --git a/GPU/GPU.cpp b/GPU/GPU.cpp index f731ee6cbb00..a71b19f19d8c 100644 --- a/GPU/GPU.cpp +++ b/GPU/GPU.cpp @@ -39,6 +39,10 @@ #include "GPU/D3D11/GPU_D3D11.h" #endif +#if PPSSPP_API(GX2) +#include "GPU/GX2/GPU_GX2.h" +#endif + GPUStatistics gpuStats; GPUInterface *gpu; GPUDebugInterface *gpuDebug; @@ -101,6 +105,13 @@ bool GPU_Init(GraphicsContext *ctx, Draw::DrawContext *draw) { } SetGPU(new GPU_Vulkan(ctx, draw)); break; + case GPUCORE_GX2: +#if PPSSPP_PLATFORM(WIIU) + SetGPU(new GPU_GX2(ctx, draw)); + break; +#else + return false; +#endif } return gpu != NULL; diff --git a/GPU/GPUCommon.cpp b/GPU/GPUCommon.cpp index 32bcb5ec6653..c1514077bf18 100644 --- a/GPU/GPUCommon.cpp +++ b/GPU/GPUCommon.cpp @@ -657,7 +657,7 @@ int GPUCommon::GetStack(int index, u32 stackPtr) { } if (index >= 0) { - auto stack = PSPPointer::Create(stackPtr); + auto stack = PSPPointer::Create(stackPtr); if (stack.IsValid()) { auto entry = currentList->stack[index]; // Not really sure 
what most of these values are. @@ -688,7 +688,7 @@ u32 GPUCommon::EnqueueList(u32 listpc, u32 stall, int subIntrBase, PSPPointersize >= 16 ? args->stackAddr : 0; + u32 stackAddr = args.IsValid() && args->size >= 16 ? (u32)args->stackAddr : 0; // Check compatibility if (sceKernelGetCompiledSdkVersion() > 0x01FFFFFF) { //numStacks = 0; @@ -1028,7 +1028,7 @@ void GPUCommon::FastRunLoop(DisplayList &list) { int dc = downcount; for (; dc > 0; --dc) { // We know that display list PCs have the upper nibble == 0 - no need to mask the pointer - const u32 op = *(const u32 *)(Memory::base + list.pc); + const u32 op = Memory::ReadUnchecked_U32(list.pc); const u32 cmd = op >> 24; const CommandInfo &info = cmdInfo[cmd]; const u32 diff = op ^ gstate.cmdmem[cmd]; @@ -1625,8 +1625,8 @@ void GPUCommon::Execute_Prim(u32 op, u32 diff) { int totalVertCount = count; // PRIMs are often followed by more PRIMs. Save some work and submit them immediately. - const u32 *src = (const u32 *)Memory::GetPointerUnchecked(currentList->pc + 4); - const u32 *stall = currentList->stall ? (const u32 *)Memory::GetPointerUnchecked(currentList->stall) : 0; + const u32_le *src = (const u32_le *)Memory::GetPointerUnchecked(currentList->pc + 4); + const u32_le *stall = currentList->stall ? (const u32_le *)Memory::GetPointerUnchecked(currentList->stall) : 0; int cmdCount = 0; // Optimized submission of sequences of PRIM. Allows us to avoid going through all the mess @@ -2275,8 +2275,8 @@ void GPUCommon::FlushImm() { // through vertices. // Since the only known use is Thrillville and it only uses it to clear, we just use color and pos. 
struct ImmVertex { - uint32_t color; - float xyz[3]; + u32_le color; + float_le xyz[3]; }; ImmVertex temp[MAX_IMMBUFFER_SIZE]; for (int i = 0; i < immCount_; i++) { diff --git a/GPU/GPUCommon.h b/GPU/GPUCommon.h index 2dd4d7da1bbe..285fd32e6fda 100644 --- a/GPU/GPUCommon.h +++ b/GPU/GPUCommon.h @@ -40,6 +40,7 @@ enum { }; struct TransformedVertex { + TransformedVertex() {} union { struct { float x, y, z, fog; // in case of morph, preblend during decode @@ -54,11 +55,11 @@ struct TransformedVertex { }; union { u8 color0[4]; // prelit - u32 color0_32; + u32_le color0_32; }; union { u8 color1[4]; // prelit - u32 color1_32; + u32_le color1_32; }; }; diff --git a/GPU/GX2/DepalettizeShaderGX2.cpp b/GPU/GX2/DepalettizeShaderGX2.cpp new file mode 100644 index 000000000000..010d75d2e8f5 --- /dev/null +++ b/GPU/GX2/DepalettizeShaderGX2.cpp @@ -0,0 +1,196 @@ +// Copyright (c) 2014- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. 
+ +#include +#include + +#include "base/basictypes.h" +#include "Common/Log.h" +#include "Common/ColorConv.h" +#include "Common/StringUtils.h" +#include "Core/Reporting.h" +#include "GPU/GX2/TextureCacheGX2.h" +#include "GPU/GX2/DepalettizeShaderGX2.h" +#include "GPU/GX2/GX2Util.h" +#include "GPU/Common/DepalettizeShaderCommon.h" + +DepalShaderCacheGX2::DepalShaderCacheGX2(Draw::DrawContext *draw) { + static const GX2AttribStream depalAttribStream[] = { + { 0, 0, 0, GX2_ATTRIB_FORMAT_FLOAT_32_32_32, GX2_ATTRIB_INDEX_PER_VERTEX, 0, GX2_COMP_SEL(_x, _y, _z, _1), GX2_ENDIAN_SWAP_DEFAULT }, + { 1, 0, 12, GX2_ATTRIB_FORMAT_FLOAT_32_32, GX2_ATTRIB_INDEX_PER_VERTEX, 0, GX2_COMP_SEL(_x, _y, _0, _0), GX2_ENDIAN_SWAP_DEFAULT }, + }; + fetchShader_.size = GX2CalcFetchShaderSizeEx(countof(depalAttribStream), GX2_FETCH_SHADER_TESSELLATION_NONE, GX2_TESSELLATION_MODE_DISCRETE); + fetchShader_.program = (u8 *)MEM2_alloc(fetchShader_.size, GX2_SHADER_ALIGNMENT); + GX2InitFetchShaderEx(&fetchShader_, fetchShader_.program, countof(depalAttribStream), depalAttribStream, GX2_FETCH_SHADER_TESSELLATION_NONE, GX2_TESSELLATION_MODE_DISCRETE); + GX2Invalidate(GX2_INVALIDATE_MODE_CPU_SHADER, fetchShader_.program, fetchShader_.size); + + context_ = (GX2ContextState *)draw->GetNativeObject(Draw::NativeObject::CONTEXT); +} + +DepalShaderCacheGX2::~DepalShaderCacheGX2() { + Clear(); + MEM2_free(fetchShader_.program); +} + +GX2Texture *DepalShaderCacheGX2::GetClutTexture(GEPaletteFormat clutFormat, const u32 clutHash, u32_le *rawClut, bool expandTo32bit) { + const u32 clutId = GetClutID(clutFormat, clutHash); + + auto oldtex = texCache_.find(clutId); + if (oldtex != texCache_.end()) { + oldtex->second->lastFrame = gpuStats.numFlips; + return oldtex->second; + } + + int texturePixels = clutFormat == GE_CMODE_32BIT_ABGR8888 ? 256 : 512; + int bpp = clutFormat == GE_CMODE_32BIT_ABGR8888 ? 
4 : 2; + GX2SurfaceFormat dstFmt; + u32_le *expanded = nullptr; + if (expandTo32bit && clutFormat != GE_CMODE_32BIT_ABGR8888) { + expanded = new u32_le[texturePixels]; + switch (clutFormat) { + case GE_CMODE_16BIT_ABGR4444: + ConvertRGBA4444ToRGBA8888(expanded, (const u16_le *)rawClut, texturePixels); + break; + case GE_CMODE_16BIT_ABGR5551: + ConvertRGBA5551ToRGBA8888(expanded, (const u16_le *)rawClut, texturePixels); + break; + case GE_CMODE_16BIT_BGR5650: + ConvertRGB565ToRGBA8888(expanded, (const u16_le *)rawClut, texturePixels); + break; + } + rawClut = expanded; + dstFmt = GX2_SURFACE_FORMAT_UNORM_R8_G8_B8_A8; + bpp = 4; + } + else { + dstFmt = GetClutDestFormatGX2(clutFormat); + } + + DepalTextureGX2 *tex = new DepalTextureGX2(); + + tex->surface.width = texturePixels; + tex->surface.height = 1; + tex->surface.depth = 1; + tex->surface.dim = GX2_SURFACE_DIM_TEXTURE_1D; + tex->surface.tileMode = GX2_TILE_MODE_LINEAR_ALIGNED; + tex->surface.use = GX2_SURFACE_USE_TEXTURE; + tex->viewNumSlices = 1; + + tex->surface.format = dstFmt; + tex->compMap = GX2_COMP_SEL(_a, _r, _g, _b); + + GX2CalcSurfaceSizeAndAlignment(&tex->surface); + GX2InitTextureRegs(tex); + + tex->surface.image = MEM2_alloc(tex->surface.imageSize, tex->surface.alignment); + _assert_(tex->surface.image); + + if (bpp == 2) { + const u16_le *src = (const u16_le *)rawClut; + u16_le *dst = (u16_le *)tex->surface.image; + while (src < (u16_le *)rawClut + texturePixels) { + *dst++ = (*src++); + } + } else { + const u32_le *src = (const u32_le *)rawClut; + u32_le *dst = (u32_le *)tex->surface.image; + while (src < (u32_le *)rawClut + texturePixels) { + *dst++ = (*src++); + } + } + GX2Invalidate(GX2_INVALIDATE_MODE_CPU_TEXTURE, tex->surface.image, tex->surface.imageSize); + + tex->lastFrame = gpuStats.numFlips; + texCache_[clutId] = tex; + + if (expandTo32bit) { + delete[] expanded; + } + return tex; +} + +void DepalShaderCacheGX2::Clear() { + for (auto shader = cache_.begin(); shader != cache_.end(); 
++shader) { + delete shader->second; + } + cache_.clear(); + + for (auto tex = texCache_.begin(); tex != texCache_.end(); ++tex) { + delete tex->second; + } + texCache_.clear(); +} + +void DepalShaderCacheGX2::Decimate() { + for (auto tex = texCache_.begin(); tex != texCache_.end();) { + if (tex->second->lastFrame + DEPAL_TEXTURE_OLD_AGE < gpuStats.numFlips) { + delete tex->second; + texCache_.erase(tex++); + } else { + ++tex; + } + } +} + +extern "C" GX2PixelShader GX2_fsCol; +DepalShaderCacheGX2::DepalShaderGX2::DepalShaderGX2(GEBufferFormat pixelFormat) : GX2PixelShader(GX2_fsCol) { + // TODO; + program = (u8*)MEM2_alloc(size, GX2_SHADER_ALIGNMENT); + memcpy(program, GX2_fsCol.program, size); + GX2Invalidate(GX2_INVALIDATE_MODE_CPU_SHADER, program, size); +} + +GX2PixelShader *DepalShaderCacheGX2::GetDepalettizePixelShader(u32 clutMode, GEBufferFormat pixelFormat) { + // TODO: + return nullptr; + u32 id = GenerateShaderID(clutMode, pixelFormat); + + auto shader = cache_.find(id); + if (shader != cache_.end()) { + return shader->second; + } + + DepalShaderGX2 *depal = new DepalShaderGX2(pixelFormat); + cache_[id] = depal; + + return depal; +} + +std::vector DepalShaderCacheGX2::DebugGetShaderIDs(DebugShaderType type) { + std::vector ids; + for (auto &iter : cache_) { + ids.push_back(StringFromFormat("%08x", iter.first)); + } + return ids; +} + +std::string DepalShaderCacheGX2::DebugGetShaderString(std::string idstr, DebugShaderType type, DebugShaderStringType stringType) { + u32 id; + sscanf(idstr.c_str(), "%08x", &id); + auto iter = cache_.find(id); + if (iter == cache_.end()) + return ""; + switch (stringType) { + case SHADER_STRING_SHORT_DESC: + return idstr; + case SHADER_STRING_SOURCE_CODE: + // TODO: disassemble shader + return "N/A"; + default: + return ""; + } +} diff --git a/GPU/GX2/DepalettizeShaderGX2.h b/GPU/GX2/DepalettizeShaderGX2.h new file mode 100644 index 000000000000..c713850a1dca --- /dev/null +++ b/GPU/GX2/DepalettizeShaderGX2.h @@ -0,0 +1,63 
@@ +// Copyright (c) 2017- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. + +#include +#include +#include +#include +#include + +#include "Common/CommonTypes.h" +#include "GPU/ge_constants.h" +#include "thin3d/thin3d.h" +#include "GPU/Common/DepalettizeShaderCommon.h" +#include "GPU/GX2/GX2Shaders.h" + +// Caches both shaders and palette textures. +class DepalShaderCacheGX2 : public DepalShaderCacheCommon { +public: + DepalShaderCacheGX2(Draw::DrawContext *draw); + ~DepalShaderCacheGX2(); + + // This also uploads the palette and binds the correct texture. 
+ GX2PixelShader *GetDepalettizePixelShader(u32 clutMode, GEBufferFormat pixelFormat); + GX2VertexShader *GetDepalettizeVertexShader() { return &defVShaderGX2; } + GX2FetchShader *GetFetchShader() { return &fetchShader_; } + GX2Texture *GetClutTexture(GEPaletteFormat clutFormat, const u32 clutHash, u32_le *rawClut, bool expandTo32bit); + void Clear(); + void Decimate(); + std::vector DebugGetShaderIDs(DebugShaderType type); + std::string DebugGetShaderString(std::string id, DebugShaderType type, DebugShaderStringType stringType); + +private: + struct DepalShaderGX2 : public GX2PixelShader { + DepalShaderGX2(GEBufferFormat pixelFormat); + ~DepalShaderGX2() { MEM2_free(program); } + }; + + struct DepalTextureGX2 : public GX2Texture { + DepalTextureGX2() : GX2Texture({}) {} + ~DepalTextureGX2() { MEM2_free(surface.image); } + int lastFrame; + }; + + GX2ContextState *context_; + GX2FetchShader fetchShader_ = {}; + + std::map cache_; + std::map texCache_; +}; diff --git a/GPU/GX2/DrawEngineGX2.cpp b/GPU/GX2/DrawEngineGX2.cpp new file mode 100644 index 000000000000..147eb5f77504 --- /dev/null +++ b/GPU/GX2/DrawEngineGX2.cpp @@ -0,0 +1,779 @@ +// Copyright (c) 2012- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. 
+ +#include + +#include "profiler/profiler.h" + +#include "Common/MemoryUtil.h" +#include "Core/MemMap.h" +#include "Core/Host.h" +#include "Core/System.h" +#include "Core/Reporting.h" +#include "Core/Config.h" +#include "Core/CoreTiming.h" + +#include "GPU/Math3D.h" +#include "GPU/GPUState.h" +#include "GPU/ge_constants.h" + +#include "GPU/Common/TextureDecoder.h" +#include "GPU/Common/SplineCommon.h" + +#include "GPU/Common/TransformCommon.h" +#include "GPU/Common/VertexDecoderCommon.h" +#include "GPU/Common/SoftwareTransformCommon.h" +#include "GPU/GX2/FramebufferManagerGX2.h" +#include "GPU/GX2/TextureCacheGX2.h" +#include "GPU/GX2/DrawEngineGX2.h" +#include "GPU/GX2/ShaderManagerGX2.h" +#include "GPU/GX2/GPU_GX2.h" + +static const GX2PrimitiveMode GX2prim[8] = { + GX2_PRIMITIVE_MODE_POINTS, + GX2_PRIMITIVE_MODE_LINES, + GX2_PRIMITIVE_MODE_LINE_STRIP, + GX2_PRIMITIVE_MODE_TRIANGLES, + GX2_PRIMITIVE_MODE_TRIANGLE_STRIP, + GX2_PRIMITIVE_MODE_TRIANGLES, + GX2_PRIMITIVE_MODE_TRIANGLES, + GX2_PRIMITIVE_MODE_INVALID +}; + +#define VERTEXCACHE_DECIMATION_INTERVAL 17 + +enum { VAI_KILL_AGE = 120, VAI_UNRELIABLE_KILL_AGE = 240, VAI_UNRELIABLE_KILL_MAX = 4 }; +enum { + VERTEX_PUSH_SIZE = 1024 * 1024 * 16, + INDEX_PUSH_SIZE = 1024 * 1024 * 4, + UBO_PUSH_SIZE = 1024 * 1024 * 16, +}; + +static const GX2AttribStream TransformedVertexElements[] = { + { 0, 0, 0, GX2_ATTRIB_FORMAT_FLOAT_32_32_32_32, GX2_ATTRIB_INDEX_PER_VERTEX, 0, GX2_COMP_SEL(_x, _y, _z, _w), GX2_ENDIAN_SWAP_DEFAULT }, + { 1, 0, 16, GX2_ATTRIB_FORMAT_FLOAT_32_32_32, GX2_ATTRIB_INDEX_PER_VERTEX, 0, GX2_COMP_SEL(_x, _y, _z, _1), GX2_ENDIAN_SWAP_DEFAULT }, + { 2, 0, 28, GX2_ATTRIB_FORMAT_UNORM_8_8_8_8, GX2_ATTRIB_INDEX_PER_VERTEX, 0, GX2_COMP_SEL(_r, _g, _b, _a), GX2_ENDIAN_SWAP_8_IN_32 }, + { 3, 0, 32, GX2_ATTRIB_FORMAT_UNORM_8_8_8_8, GX2_ATTRIB_INDEX_PER_VERTEX, 0, GX2_COMP_SEL(_r, _g, _b, _a), GX2_ENDIAN_SWAP_8_IN_32 }, +}; + +DrawEngineGX2::DrawEngineGX2(Draw::DrawContext *draw, GX2ContextState *context) + : 
draw_(draw), + context_(context), + vai_(256), + fetchShaderMap_(32), + blendCache_(32), + depthStencilCache_(64), + rasterCache_(4) +{ + decOptions_.expandAllWeightsToFloat = true; + decOptions_.expand8BitNormalsToFloat = true; + + decimationCounter_ = VERTEXCACHE_DECIMATION_INTERVAL; + + // All this is a LOT of memory, need to see if we can cut down somehow. + decoded = (u8 *)MEM2_alloc(DECODED_VERTEX_BUFFER_SIZE, GX2_VERTEX_BUFFER_ALIGNMENT); + decIndex = (u16 *)MEM2_alloc(DECODED_INDEX_BUFFER_SIZE, GX2_INDEX_BUFFER_ALIGNMENT); + + indexGen.Setup(decIndex); + + InitDeviceObjects(); +} + +DrawEngineGX2::~DrawEngineGX2() { + DestroyDeviceObjects(); + MEM2_free(decoded); + MEM2_free(decIndex); +} + +void DrawEngineGX2::InitDeviceObjects() { + pushVerts_ = new PushBufferGX2(VERTEX_PUSH_SIZE, GX2_VERTEX_BUFFER_ALIGNMENT, GX2_INVALIDATE_MODE_CPU_ATTRIBUTE_BUFFER); + pushInds_ = new PushBufferGX2(INDEX_PUSH_SIZE, GX2_INDEX_BUFFER_ALIGNMENT, GX2_INVALIDATE_MODE_CPU_ATTRIBUTE_BUFFER); + pushUBO_ = new PushBufferGX2(UBO_PUSH_SIZE, GX2_UNIFORM_BLOCK_ALIGNMENT, GX2_INVALIDATE_MODE_CPU_UNIFORM_BLOCK); + + tessDataTransfer = new TessellationDataTransferGX2(context_); +} + +void DrawEngineGX2::ClearTrackedVertexArrays() { + vai_.Iterate([&](u32 hash, VertexArrayInfoGX2 *vai) { delete vai; }); + vai_.Clear(); +} + +void DrawEngineGX2::ClearInputLayoutMap() { + fetchShaderMap_.Iterate([&](const FetchShaderKey &key, GX2FetchShader *il) { + MEM2_free(il->program); + delete il; + }); + fetchShaderMap_.Clear(); +} + +void DrawEngineGX2::Resized() { + DrawEngineCommon::Resized(); + ClearInputLayoutMap(); +} + +void DrawEngineGX2::DestroyDeviceObjects() { + ClearTrackedVertexArrays(); + ClearInputLayoutMap(); + delete tessDataTransfer; + delete pushVerts_; + delete pushInds_; + delete pushUBO_; + tessDataTransfer = nullptr; + pushVerts_ = nullptr; + pushInds_ = nullptr; + pushUBO_ = nullptr; + depthStencilCache_.Iterate([&](const u64 &key, GX2DepthStencilControlReg *ds) { free(ds); 
}); + depthStencilCache_.Clear(); + blendCache_.Iterate([&](const u64 &key, GX2BlendState *bs) { free(bs); }); + blendCache_.Clear(); + rasterCache_.Iterate([&](const u32 &key, GX2RasterizerState *rs) { free(rs); }); + rasterCache_.Clear(); +} + +struct DeclTypeInfo { + u32 mask; + GX2EndianSwapMode endianSwap; + GX2AttribFormat format; +}; + +static const DeclTypeInfo VComp[] = { + { GX2_COMP_SEL(_0, _0, _0, _0), GX2_ENDIAN_SWAP_DEFAULT, GX2_ATTRIB_FORMAT_FLOAT_32_32_32_32 }, // DEC_NONE, + { GX2_COMP_SEL(_x, _0, _0, _1), GX2_ENDIAN_SWAP_DEFAULT, GX2_ATTRIB_FORMAT_FLOAT_32 }, // DEC_FLOAT_1, + { GX2_COMP_SEL(_x, _y, _0, _1), GX2_ENDIAN_SWAP_DEFAULT, GX2_ATTRIB_FORMAT_FLOAT_32_32 }, // DEC_FLOAT_2, + { GX2_COMP_SEL(_x, _y, _z, _1), GX2_ENDIAN_SWAP_DEFAULT, GX2_ATTRIB_FORMAT_FLOAT_32_32_32 }, // DEC_FLOAT_3, + { GX2_COMP_SEL(_x, _y, _z, _w), GX2_ENDIAN_SWAP_DEFAULT, GX2_ATTRIB_FORMAT_FLOAT_32_32_32_32 }, // DEC_FLOAT_4, + { GX2_COMP_SEL(_x, _y, _z, _1), GX2_ENDIAN_SWAP_8_IN_32, GX2_ATTRIB_FORMAT_SNORM_8_8_8_8 }, // DEC_S8_3, + { GX2_COMP_SEL(_x, _y, _z, _1), GX2_ENDIAN_SWAP_8_IN_16, GX2_ATTRIB_FORMAT_SNORM_16_16_16_16 }, // DEC_S16_3, + { GX2_COMP_SEL(_x, _0, _0, _1), GX2_ENDIAN_SWAP_DEFAULT, GX2_ATTRIB_FORMAT_UNORM_8 }, // DEC_U8_1, + { GX2_COMP_SEL(_x, _y, _0, _1), GX2_ENDIAN_SWAP_8_IN_16, GX2_ATTRIB_FORMAT_UNORM_8_8 }, // DEC_U8_2, + { GX2_COMP_SEL(_x, _y, _z, _1), GX2_ENDIAN_SWAP_8_IN_32, GX2_ATTRIB_FORMAT_UNORM_8_8_8_8 }, // DEC_U8_3, + { GX2_COMP_SEL(_x, _y, _z, _w), GX2_ENDIAN_SWAP_8_IN_32, GX2_ATTRIB_FORMAT_UNORM_8_8_8_8 }, // DEC_U8_4, + { GX2_COMP_SEL(_x, _0, _0, _1), GX2_ENDIAN_SWAP_8_IN_16, GX2_ATTRIB_FORMAT_UNORM_16 }, // DEC_U16_1, + { GX2_COMP_SEL(_x, _y, _0, _1), GX2_ENDIAN_SWAP_8_IN_16, GX2_ATTRIB_FORMAT_UNORM_16_16 }, // DEC_U16_2, + { GX2_COMP_SEL(_x, _y, _z, _1), GX2_ENDIAN_SWAP_8_IN_16, GX2_ATTRIB_FORMAT_UNORM_16_16_16_16 }, // DEC_U16_3, + { GX2_COMP_SEL(_x, _y, _z, _w), GX2_ENDIAN_SWAP_8_IN_16, GX2_ATTRIB_FORMAT_UNORM_16_16_16_16 }, // 
DEC_U16_4, +}; + +static void VertexAttribSetup(GX2AttribStream *VertexElement, u8 fmt, u8 offset, u8 location) { + VertexElement->location = location; + VertexElement->buffer = 0; + VertexElement->offset = offset; + VertexElement->format = VComp[fmt & 0xF].format; + VertexElement->type = GX2_ATTRIB_INDEX_PER_VERTEX; + VertexElement->aluDivisor = 0; + VertexElement->mask = VComp[fmt & 0xF].mask; + VertexElement->endianSwap = VComp[fmt & 0xF].endianSwap; +} + +GX2FetchShader *DrawEngineGX2::SetupFetchShaderForDraw(GX2VertexShader *vshader, const DecVtxFormat &decFmt, u32 pspFmt) { + // TODO: Instead of one for each vshader, we can reduce it to one for each type of shader + // that reads TEXCOORD or not, etc. Not sure if worth it. + FetchShaderKey key{ vshader, decFmt.id }; + GX2FetchShader *fetchShader = fetchShaderMap_.Get(key); + if (fetchShader) { + return fetchShader; + } + + GX2AttribStream VertexElements[8]; + GX2AttribStream *VertexElement = &VertexElements[0]; + + // POSITION + // Always + VertexAttribSetup(VertexElement, decFmt.posfmt, decFmt.posoff, (u32)GX2Gen::VSInput::POSITION); + VertexElement++; + + // TC + if (decFmt.uvfmt != 0) { + VertexAttribSetup(VertexElement, decFmt.uvfmt, decFmt.uvoff, (u32)GX2Gen::VSInput::COORDS); + VertexElement++; + } + + // COLOR + if (decFmt.c0fmt != 0) { + VertexAttribSetup(VertexElement, decFmt.c0fmt, decFmt.c0off, (u32)GX2Gen::VSInput::COLOR0); + VertexElement++; + } + + // Never used ? 
+ if (decFmt.c1fmt != 0) { + VertexAttribSetup(VertexElement, decFmt.c1fmt, decFmt.c1off, (u32)GX2Gen::VSInput::COLOR1); + VertexElement++; + } + + // NORMAL + if (decFmt.nrmfmt != 0) { + VertexAttribSetup(VertexElement, decFmt.nrmfmt, decFmt.nrmoff, (u32)GX2Gen::VSInput::NORMAL); + VertexElement++; + } + + // WEIGHT + if (decFmt.w0fmt != 0) { + VertexAttribSetup(VertexElement, decFmt.w0fmt, decFmt.w0off, (u32)GX2Gen::VSInput::WEIGHT0); + VertexElement++; + } + + if (decFmt.w1fmt != 0) { + VertexAttribSetup(VertexElement, decFmt.w1fmt, decFmt.w1off, (u32)GX2Gen::VSInput::WEIGHT1); + VertexElement++; + } + + // Create fetchShader + fetchShader = new GX2FetchShader; + fetchShader->size = GX2CalcFetchShaderSize(VertexElement - VertexElements); + fetchShader->program = (u8 *)MEM2_alloc(fetchShader->size, GX2_SHADER_ALIGNMENT); + GX2InitFetchShader(fetchShader, fetchShader->program, VertexElement - VertexElements, VertexElements); + GX2Invalidate(GX2_INVALIDATE_MODE_CPU_SHADER, fetchShader->program, fetchShader->size); + + // Add it to map + fetchShaderMap_.Insert(key, fetchShader); + return fetchShader; +} + +void DrawEngineGX2::MarkUnreliable(VertexArrayInfoGX2 *vai) { + vai->status = VertexArrayInfoGX2::VAI_UNRELIABLE; + + MEM2_free(vai->vbo); + vai->vbo = nullptr; + + MEM2_free(vai->ebo); + vai->ebo = nullptr; +} + +void DrawEngineGX2::BeginFrame() { + pushVerts_->Reset(); + pushInds_->Reset(); + pushUBO_->Reset(); + + if (--decimationCounter_ <= 0) { + decimationCounter_ = VERTEXCACHE_DECIMATION_INTERVAL; + } else { + return; + } + + const int threshold = gpuStats.numFlips - VAI_KILL_AGE; + const int unreliableThreshold = gpuStats.numFlips - VAI_UNRELIABLE_KILL_AGE; + int unreliableLeft = VAI_UNRELIABLE_KILL_MAX; + vai_.Iterate([&](u32 hash, VertexArrayInfoGX2 *vai) { + bool kill; + if (vai->status == VertexArrayInfoGX2::VAI_UNRELIABLE) { + // We limit killing unreliable so we don't rehash too often. 
+ kill = vai->lastFrame < unreliableThreshold && --unreliableLeft >= 0; + } else { + kill = vai->lastFrame < threshold; + } + if (kill) { + delete vai; + vai_.Remove(hash); + } + }); + vai_.Maintain(); + + // Enable if you want to see vertex decoders in the log output. Need a better way. +#if 0 + char buffer[16384]; + for (std::map::iterator dec = decoderMap_.begin(); dec != decoderMap_.end(); ++dec) { + char *ptr = buffer; + ptr += dec->second->ToString(ptr); + // *ptr++ = '\n'; + NOTICE_LOG(G3D, buffer); + } +#endif + + lastRenderStepId_ = -1; +} + +VertexArrayInfoGX2::~VertexArrayInfoGX2() { + MEM2_free(vbo); + MEM2_free(ebo); +} + +static u32 SwapRB(u32 c) { return (c & 0xFF00FF00) | ((c >> 16) & 0xFF) | ((c << 16) & 0xFF0000); } + +// The inline wrapper in the header checks for numDrawCalls == 0 +void DrawEngineGX2::DoFlush() { + PROFILE_THIS_SCOPE("Flush"); + gpuStats.numFlushes++; + gpuStats.numTrackedVertexArrays = (int)vai_.size(); + + // In GX2, we're synchronous and state carries over so all we reset here on a new step is the viewport/scissor. + int curRenderStepId = draw_->GetCurrentStepId(); + if (lastRenderStepId_ != curRenderStepId) { + // Dirty everything that has dynamic state that will need re-recording. + gstate_c.Dirty(DIRTY_VIEWPORTSCISSOR_STATE); + lastRenderStepId_ = curRenderStepId; + } + + // This is not done on every drawcall, we collect vertex data + // until critical state changes. That's when we draw (flush). + + GEPrimitiveType prim = prevPrim_; + ApplyDrawState(prim); + + // Always use software for flat shading to fix the provoking index. + bool tess = gstate_c.bezier || gstate_c.spline; + bool useHWTransform = CanUseHardwareTransform(prim); + + if (useHWTransform) { + PROFILE_THIS_SCOPE("T-hard"); + void *vb_ = nullptr; + void *ib_ = nullptr; + + int vertexCount = 0; + int maxIndex = 0; + bool useElements = true; + + // Cannot cache vertex data with morph enabled. 
+ bool useCache = g_Config.bVertexCache && !(lastVType_ & GE_VTYPE_MORPHCOUNT_MASK); + // Also avoid caching when software skinning. + if (g_Config.bSoftwareSkinning && (lastVType_ & GE_VTYPE_WEIGHT_MASK)) + useCache = false; + + if (useCache) { + PROFILE_THIS_SCOPE("vcache"); + u32 id = dcid_ ^ gstate.getUVGenMode(); // This can have an effect on which UV decoder we need to use! And hence what the decoded data will look like. See #9263 + + VertexArrayInfoGX2 *vai = vai_.Get(id); + if (!vai) { + vai = new VertexArrayInfoGX2(); + vai_.Insert(id, vai); + } + + switch (vai->status) { + case VertexArrayInfoGX2::VAI_NEW: { + // Haven't seen this one before. + uint64_t dataHash = ComputeHash(); + vai->hash = dataHash; + vai->minihash = ComputeMiniHash(); + vai->status = VertexArrayInfoGX2::VAI_HASHING; + vai->drawsUntilNextFullHash = 0; + DecodeVerts(decoded); // writes to indexGen + vai->numVerts = indexGen.VertexCount(); + vai->prim = indexGen.Prim(); + vai->maxIndex = indexGen.MaxIndex(); + vai->flags = gstate_c.vertexFullAlpha ? VAI11_FLAG_VERTEXFULLALPHA : 0; + goto rotateVBO; + } + + // Hashing - still gaining confidence about the buffer. + // But if we get this far it's likely to be worth creating a vertex buffer. + case VertexArrayInfoGX2::VAI_HASHING: { + PROFILE_THIS_SCOPE("vcachehash"); + vai->numDraws++; + if (vai->lastFrame != gpuStats.numFlips) { + vai->numFrames++; + } + if (vai->drawsUntilNextFullHash == 0) { + // Let's try to skip a full hash if mini would fail. + const u32 newMiniHash = ComputeMiniHash(); + uint64_t newHash = vai->hash; + if (newMiniHash == vai->minihash) { + newHash = ComputeHash(); + } + if (newMiniHash != vai->minihash || newHash != vai->hash) { + MarkUnreliable(vai); + DecodeVerts(decoded); + goto rotateVBO; + } + if (vai->numVerts > 64) { + // exponential backoff up to 16 draws, then every 24 + vai->drawsUntilNextFullHash = std::min(24, vai->numFrames); + } else { + // Lower numbers seem much more likely to change. 
+						vai->drawsUntilNextFullHash = 0;
+					}
+					// TODO: tweak
+					// if (vai->numFrames > 1000) {
+					// vai->status = VertexArrayInfo::VAI_RELIABLE;
+					//}
+				} else {
+					vai->drawsUntilNextFullHash--;
+					u32 newMiniHash = ComputeMiniHash();
+					if (newMiniHash != vai->minihash) {
+						MarkUnreliable(vai);
+						DecodeVerts(decoded);
+						goto rotateVBO;
+					}
+				}
+
+				if (vai->vbo == nullptr) { // Nothing cached yet: decode, then copy the data into MEM2 buffers.
+					DecodeVerts(decoded);
+					vai->numVerts = indexGen.VertexCount();
+					vai->prim = indexGen.Prim();
+					vai->maxIndex = indexGen.MaxIndex();
+					vai->flags = gstate_c.vertexFullAlpha ? VAI11_FLAG_VERTEXFULLALPHA : 0;
+					useElements = !indexGen.SeenOnlyPurePrims() || prim == GE_PRIM_TRIANGLE_FAN;
+					if (!useElements && indexGen.PureCount()) {
+						vai->numVerts = indexGen.PureCount();
+					}
+
+					_dbg_assert_msg_(gstate_c.vertBounds.minV >= gstate_c.vertBounds.maxV, "Should not have checked UVs when caching.");
+
+					// TODO: Combine these two into one buffer?
+					u32 size = dec_->GetDecVtxFmt().stride * indexGen.MaxIndex();
+					vai->vbo = MEM2_alloc(size, GX2_VERTEX_BUFFER_ALIGNMENT);
+					memcpy(vai->vbo, decoded, size);
+					GX2Invalidate(GX2_INVALIDATE_MODE_CPU_ATTRIBUTE_BUFFER, vai->vbo, size);
+					if (useElements) {
+						u32 indexSize = sizeof(short) * indexGen.VertexCount(); // own name: no longer shadows the vertex buffer size above
+						vai->ebo = MEM2_alloc(indexSize, GX2_INDEX_BUFFER_ALIGNMENT);
+						memcpy(vai->ebo, decIndex, indexSize);
+						GX2Invalidate(GX2_INVALIDATE_MODE_CPU_ATTRIBUTE_BUFFER, vai->ebo, indexSize);
+					} else {
+						vai->ebo = nullptr;
+					}
+				} else { // Cached buffers exist: reuse them and only update the stats.
+					gpuStats.numCachedDrawCalls++;
+					useElements = vai->ebo ? true : false;
+					gpuStats.numCachedVertsDrawn += vai->numVerts;
+					gstate_c.vertexFullAlpha = vai->flags & VAI11_FLAG_VERTEXFULLALPHA;
+				}
+				vb_ = vai->vbo;
+				ib_ = vai->ebo;
+				vertexCount = vai->numVerts;
+				maxIndex = vai->maxIndex;
+				prim = static_cast<GEPrimitiveType>(vai->prim); // vai->prim is stored as s8
+				break;
+			}
+
+			// Reliable - we don't even bother hashing anymore. Right now we don't go here until after a very long time.
+ case VertexArrayInfoGX2::VAI_RELIABLE: { + vai->numDraws++; + if (vai->lastFrame != gpuStats.numFlips) { + vai->numFrames++; + } + gpuStats.numCachedDrawCalls++; + gpuStats.numCachedVertsDrawn += vai->numVerts; + vb_ = vai->vbo; + ib_ = vai->ebo; + + vertexCount = vai->numVerts; + + maxIndex = vai->maxIndex; + prim = static_cast(vai->prim); + + gstate_c.vertexFullAlpha = vai->flags & VAI11_FLAG_VERTEXFULLALPHA; + break; + } + + case VertexArrayInfoGX2::VAI_UNRELIABLE: { + vai->numDraws++; + if (vai->lastFrame != gpuStats.numFlips) { + vai->numFrames++; + } + DecodeVerts(decoded); + goto rotateVBO; + } + } + + vai->lastFrame = gpuStats.numFlips; + } else { + DecodeVerts(decoded); + rotateVBO: + gpuStats.numUncachedVertsDrawn += indexGen.VertexCount(); + useElements = !indexGen.SeenOnlyPurePrims() || prim == GE_PRIM_TRIANGLE_FAN; + vertexCount = indexGen.VertexCount(); + maxIndex = indexGen.MaxIndex(); + if (!useElements && indexGen.PureCount()) { + vertexCount = indexGen.PureCount(); + } + prim = indexGen.Prim(); + } + + VERBOSE_LOG(G3D, "Flush prim %i! 
%i verts in one go", prim, vertexCount); + bool hasColor = (lastVType_ & GE_VTYPE_COL_MASK) != GE_VTYPE_COL_NONE; + if (gstate.isModeThrough()) { + gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && (hasColor || gstate.getMaterialAmbientA() == 255); + } else { + gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && ((hasColor && (gstate.materialupdate & 1)) || gstate.getMaterialAmbientA() == 255) && (!gstate.isLightingEnabled() || gstate.getAmbientA() == 255); + } + + ApplyDrawStateLate(true, dynState_.stencilRef); + + GX2VShader *vshader; + GX2PShader *fshader; + shaderManager_->GetShaders(prim, lastVType_, &vshader, &fshader, useHWTransform, useHWTessellation_); + GX2FetchShader *fetchShader = SetupFetchShaderForDraw(vshader, dec_->GetDecVtxFmt(), dec_->VertexType()); + GX2SetPixelShader(fshader); + GX2SetVertexShader(vshader); + shaderManager_->UpdateUniforms(pushUBO_, framebufferManager_->UseBufferedRendering()); + + GX2SetFetchShader(fetchShader); + u32 stride = dec_->GetDecVtxFmt().stride; + // GX2prim[prim]; + if (!vb_) { + // Push! 
+ u32 vOffset; + int vSize = (maxIndex + 1) * dec_->GetDecVtxFmt().stride; + u8 *vptr = pushVerts_->BeginPush(&vOffset, vSize); + memcpy(vptr, decoded, vSize); + pushVerts_->EndPush(); + GX2SetAttribBuffer(0, vSize, stride, vptr); + if (useElements) { + u32 iOffset; + int iSize = 2 * indexGen.VertexCount(); + u8 *iptr = pushInds_->BeginPush(&iOffset, iSize); + memcpy(iptr, decIndex, iSize); + pushInds_->EndPush(); + GX2DrawIndexedEx(GX2prim[prim], vertexCount, GX2_INDEX_TYPE_U16, iptr, 0, 1); + } else { + GX2DrawEx(GX2prim[prim], vertexCount, 0, 1); + } + } else { + GX2SetAttribBuffer(0, vertexCount * stride, stride, vb_); + if (useElements) { + GX2DrawIndexedEx(GX2prim[prim], vertexCount, GX2_INDEX_TYPE_U16, ib_, 0, 1); + } else { + GX2DrawEx(GX2prim[prim], vertexCount, 0, 1); + } + } + } else { + PROFILE_THIS_SCOPE("T-soft"); + DecodeVerts(decoded); + bool hasColor = (lastVType_ & GE_VTYPE_COL_MASK) != GE_VTYPE_COL_NONE; + if (gstate.isModeThrough()) { + gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && (hasColor || gstate.getMaterialAmbientA() == 255); + } else { + gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && ((hasColor && (gstate.materialupdate & 1)) || gstate.getMaterialAmbientA() == 255) && (!gstate.isLightingEnabled() || gstate.getAmbientA() == 255); + } + + gpuStats.numUncachedVertsDrawn += indexGen.VertexCount(); + prim = indexGen.Prim(); + // Undo the strip optimization, not supported by the SW code yet. + if (prim == GE_PRIM_TRIANGLE_STRIP) + prim = GE_PRIM_TRIANGLES; + VERBOSE_LOG(G3D, "Flush prim %i SW! 
%i verts in one go", prim, indexGen.VertexCount()); + + u16 *inds = decIndex; + SoftwareTransformResult result{}; + SoftwareTransformParams params{}; + params.decoded = decoded; + params.transformed = transformed; + params.transformedExpanded = transformedExpanded; + params.fbman = framebufferManager_; + params.texCache = textureCache_; + params.allowClear = true; + params.allowSeparateAlphaClear = false; // GX2 doesn't support separate alpha clears + params.provokeFlatFirst = true; + + int maxIndex = indexGen.MaxIndex(); + SoftwareTransform swTransform(params); + swTransform.Decode(prim, dec_->VertexType(), dec_->GetDecVtxFmt(), maxIndex, &result); + if (result.action == SW_NOT_READY) { + swTransform.DetectOffsetTexture(maxIndex); + swTransform.BuildDrawingParams(prim, indexGen.VertexCount(), dec_->VertexType(), inds, maxIndex, &result); + } + + if (result.setSafeSize) + framebufferManager_->SetSafeSize(result.safeWidth, result.safeHeight); + + if (result.action == SW_DRAW_PRIMITIVES) { + ApplyDrawStateLate(result.setStencil, result.stencilValue); + + GX2VShader *vshader; + GX2PShader *fshader; + shaderManager_->GetShaders(prim, lastVType_, &vshader, &fshader, false, false); + GX2SetPixelShader(fshader); + GX2SetVertexShader(vshader); + shaderManager_->UpdateUniforms(pushUBO_, framebufferManager_->UseBufferedRendering()); + + // We really do need a vertex layout for each vertex shader (or at least check its ID bits for what inputs it uses)! + // Some vertex shaders ignore one of the inputs, and then the layout created from it will lack it, which will be a problem for others. 
+ FetchShaderKey key{ vshader, 0xFFFFFFFF }; // Let's use 0xFFFFFFFF to signify TransformedVertex + GX2FetchShader *fetchShader = fetchShaderMap_.Get(key); + if (!fetchShader) { + fetchShader = new GX2FetchShader; + fetchShader->size = GX2CalcFetchShaderSize(ARRAY_SIZE(TransformedVertexElements)); + fetchShader->program = (u8 *)MEM2_alloc(fetchShader->size, GX2_SHADER_ALIGNMENT); + GX2InitFetchShader(fetchShader, fetchShader->program, ARRAY_SIZE(TransformedVertexElements), TransformedVertexElements); + GX2Invalidate(GX2_INVALIDATE_MODE_CPU_SHADER, fetchShader->program, fetchShader->size); + fetchShaderMap_.Insert(key, fetchShader); + } + GX2SetFetchShader(fetchShader); + + u32 stride = sizeof(TransformedVertex); + u32 vOffset = 0; + int vSize = maxIndex * stride; + u8 *vptr = pushVerts_->BeginPush(&vOffset, vSize); + memcpy(vptr, result.drawBuffer, vSize); + pushVerts_->EndPush(); + GX2SetAttribBuffer(0, vSize, stride, vptr); + if (result.drawIndexed) { + u32 iOffset; + int iSize = sizeof(u16) * result.drawNumTrans; + u8 *iptr = pushInds_->BeginPush(&iOffset, iSize); + memcpy(iptr, inds, iSize); + pushInds_->EndPush(); + GX2DrawIndexedEx(GX2prim[prim], result.drawNumTrans, GX2_INDEX_TYPE_U16, iptr, 0, 1); + } else { + GX2DrawEx(GX2prim[prim], result.drawNumTrans, 0, 1); + } + } else if (result.action == SW_CLEAR) { + u32 clearColor = result.color; + float clearDepth = result.depth; + + u32 clearFlag = 0; + + if (gstate.isClearModeColorMask()) + clearFlag |= Draw::FBChannel::FB_COLOR_BIT; + if (gstate.isClearModeAlphaMask()) + clearFlag |= Draw::FBChannel::FB_STENCIL_BIT; + if (gstate.isClearModeDepthMask()) + clearFlag |= Draw::FBChannel::FB_DEPTH_BIT; + + if (clearFlag & Draw::FBChannel::FB_DEPTH_BIT) { + framebufferManager_->SetDepthUpdated(); + } + if (clearFlag & Draw::FBChannel::FB_COLOR_BIT) { + framebufferManager_->SetColorUpdated(gstate_c.skipDrawReason); + } + + u8 clearStencil = clearColor >> 24; + draw_->Clear(clearFlag, clearColor, clearDepth, 
clearStencil); + + if ((gstate_c.featureFlags & GPU_USE_CLEAR_RAM_HACK) && gstate.isClearModeColorMask() && (gstate.isClearModeAlphaMask() || gstate.FrameBufFormat() == GE_FORMAT_565)) { + int scissorX1 = gstate.getScissorX1(); + int scissorY1 = gstate.getScissorY1(); + int scissorX2 = gstate.getScissorX2() + 1; + int scissorY2 = gstate.getScissorY2() + 1; + framebufferManager_->ApplyClearToMemory(scissorX1, scissorY1, scissorX2, scissorY2, clearColor); + } + } + } + + gpuStats.numDrawCalls += numDrawCalls; + gpuStats.numVertsSubmitted += vertexCountInDrawCalls_; + + indexGen.Reset(); + decodedVerts_ = 0; + numDrawCalls = 0; + vertexCountInDrawCalls_ = 0; + decodeCounter_ = 0; + dcid_ = 0; + prevPrim_ = GE_PRIM_INVALID; + gstate_c.vertexFullAlpha = true; + framebufferManager_->SetColorUpdated(gstate_c.skipDrawReason); + + // Now seems as good a time as any to reset the min/max coords, which we may examine later. + gstate_c.vertBounds.minU = 512; + gstate_c.vertBounds.minV = 512; + gstate_c.vertBounds.maxU = 0; + gstate_c.vertBounds.maxV = 0; + + { + PROFILE_THIS_SCOPE("GX2Flush"); + GX2Flush(); + } +#if 0 + // We only support GPU debugging on Windows, and that's the only use case for this. 
+ GPUDebug::NotifyDraw(); +#endif +} + +void DrawEngineGX2::TessellationDataTransferGX2::SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) { +#if 0 + // Position + if (prevSize < size) { + prevSize = size; + MEM2_free(data_tex[0].surface.image); + data_tex[0].surface.width = size; + data_tex[0].surface.height = 1; + data_tex[0].surface.depth = 1; + data_tex[0].surface.dim = GX2_SURFACE_DIM_TEXTURE_1D; + data_tex[0].surface.tileMode = GX2_TILE_MODE_LINEAR_ALIGNED; + data_tex[0].surface.use = GX2_SURFACE_USE_TEXTURE; + data_tex[0].viewNumSlices = 1; + data_tex[0].surface.format = GX2_SURFACE_FORMAT_FLOAT_R32_G32_B32_A32; + data_tex[0].compMap = GX2_COMP_SEL(_a, _r, _g, _b); + GX2CalcSurfaceSizeAndAlignment(&data_tex[0].surface); + GX2InitTextureRegs(&data_tex[0]); + data_tex[0].surface.image = MEM2_alloc(data_tex[0].surface.imageSize, data_tex[0].surface.alignment); + GX2Invalidate(GX2_INVALIDATE_MODE_CPU_TEXTURE, data_tex[0].surface.image, data_tex[0].surface.imageSize); + + if (!data_tex[0].surface.image) { + INFO_LOG(G3D, "Failed to create GX2 texture for HW tessellation"); + return; // TODO: Turn off HW tessellation if texture creation error occured. 
+ } + GX2SetVertexTexture(&data_tex[0], 0); + } + const u32 *src = (const u32 *)pos; + u32 *dst = (u32 *)data_tex[0].surface.image; + while (src < (u32 *)pos + size) { + *dst++ = __builtin_bswap32(*src++); + } + + // Texcoords + if (hasTexCoords) { + if (prevSizeTex < size) { + prevSizeTex = size; + MEM2_free(data_tex[1].surface.image); + data_tex[1].surface.width = size; + data_tex[1].surface.height = 1; + data_tex[1].surface.depth = 1; + data_tex[1].surface.dim = GX2_SURFACE_DIM_TEXTURE_1D; + data_tex[1].surface.tileMode = GX2_TILE_MODE_LINEAR_ALIGNED; + data_tex[1].surface.use = GX2_SURFACE_USE_TEXTURE; + data_tex[1].viewNumSlices = 1; + data_tex[1].surface.format = GX2_SURFACE_FORMAT_FLOAT_R32_G32_B32_A32; + data_tex[1].compMap = GX2_COMP_SEL(_a, _r, _g, _b); + GX2CalcSurfaceSizeAndAlignment(&data_tex[1].surface); + GX2InitTextureRegs(&data_tex[1]); + data_tex[1].surface.image = MEM2_alloc(data_tex[1].surface.imageSize, data_tex[1].surface.alignment); + if (!data_tex[1].surface.image) { + INFO_LOG(G3D, "Failed to create GX2 texture for HW tessellation"); + return; // TODO: Turn off HW tessellation if texture creation error occured. + } + GX2SetVertexTexture(&data_tex[1], 1); + } + src = (const u32 *)pos; + dst = (u32 *)data_tex[1].surface.image; + while (src < (u32 *)pos + size) { + *dst++ = __builtin_bswap32(*src++); + } + GX2Invalidate(GX2_INVALIDATE_MODE_CPU_TEXTURE, data_tex[1].surface.image, data_tex[1].surface.imageSize); + } + + // Color + int sizeColor = hasColor ? 
size : 1; + if (prevSizeCol < sizeColor) { + prevSizeCol = sizeColor; + MEM2_free(data_tex[2].surface.image); + data_tex[2].surface.width = sizeColor; + data_tex[2].surface.height = 1; + data_tex[2].surface.depth = 1; + data_tex[2].surface.dim = GX2_SURFACE_DIM_TEXTURE_1D; + data_tex[2].surface.tileMode = GX2_TILE_MODE_LINEAR_ALIGNED; + data_tex[2].surface.use = GX2_SURFACE_USE_TEXTURE; + data_tex[2].viewNumSlices = 1; + data_tex[2].surface.format = GX2_SURFACE_FORMAT_FLOAT_R32_G32_B32_A32; + data_tex[2].compMap = GX2_COMP_SEL(_a, _r, _g, _b); + GX2CalcSurfaceSizeAndAlignment(&data_tex[2].surface); + GX2InitTextureRegs(&data_tex[2]); + data_tex[2].surface.image = MEM2_alloc(data_tex[2].surface.imageSize, data_tex[2].surface.alignment); + if (!data_tex[2].surface.image) { + INFO_LOG(G3D, "Failed to create GX2 texture for HW tessellation"); + return; // TODO: Turn off HW tessellation if texture creation error occured. + } + GX2SetVertexTexture(&data_tex[2], 2); + } + src = (const u32 *)col; + dst = (u32 *)data_tex[2].surface.image; + while (src < (u32 *)pos + sizeColor) { + *dst++ = __builtin_bswap32(*src++); + } + GX2Invalidate(GX2_INVALIDATE_MODE_CPU_TEXTURE, data_tex[2].surface.image, data_tex[2].surface.imageSize); +#endif +} diff --git a/GPU/GX2/DrawEngineGX2.h b/GPU/GX2/DrawEngineGX2.h new file mode 100644 index 000000000000..0e1f542a8cba --- /dev/null +++ b/GPU/GX2/DrawEngineGX2.h @@ -0,0 +1,221 @@ +// Copyright (c) 2012- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. 
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#pragma once
+
+#include
+#include
+
+#include "Common/Hashmaps.h"
+#include "GPU/GPUState.h"
+#include "GPU/Common/GPUDebugInterface.h"
+#include "GPU/Common/IndexGenerator.h"
+#include "GPU/Common/VertexDecoderCommon.h"
+#include "GPU/Common/DrawEngineCommon.h"
+#include "GPU/Common/GPUStateUtils.h"
+#include "GPU/GX2/FragmentShaderGeneratorGX2.h"
+#include "GPU/GX2/StateMappingGX2.h"
+#include "GPU/GX2/GX2Util.h"
+
+struct DecVtxFormat;
+struct UVScale;
+
+class GX2VertexShader;
+class ShaderManagerGX2;
+class TextureCacheGX2;
+class FramebufferManagerGX2;
+
+// State transitions of a VertexArrayInfoGX2 (see DrawEngineGX2::DoFlush and BeginFrame):
+// On creation: VAI_NEW
+// VAI_NEW -> VAI_HASHING (first draw; hash and minihash are recorded)
+// VAI_HASHING -> VAI_UNRELIABLE (a later hash check failed, see MarkUnreliable)
+// VAI_HASHING -> VAI_RELIABLE (DoFlush handles the state, but the promotion is still a TODO there)
+// VAI_UNRELIABLE -> death (lastFrame older than VAI_UNRELIABLE_KILL_AGE, at most VAI_UNRELIABLE_KILL_MAX per sweep)
+// any other state -> death (lastFrame older than VAI_KILL_AGE)
+// Deaths happen in BeginFrame, which only sweeps once every
+// VERTEXCACHE_DECIMATION_INTERVAL calls.
+// The flag values below are stored in VertexArrayInfoGX2::flags.
+
+enum {
+	VAI11_FLAG_VERTEXFULLALPHA = 1,
+};
+
+// One vertex cache entry. Owns vbo/ebo (released by the destructor), so it is not POD; keep it small.
+class VertexArrayInfoGX2 {
+public:
+	VertexArrayInfoGX2() {
+		status = VAI_NEW;
+		vbo = nullptr;
+		ebo = nullptr;
+		prim = GE_PRIM_INVALID;
+		numDraws = 0;
+		numFrames = 0;
+		lastFrame = gpuStats.numFlips;
+		numVerts = 0;
+		drawsUntilNextFullHash = 0;
+		flags = 0;
+	}
+	~VertexArrayInfoGX2();
+
+	enum Status : u8 {
+		VAI_NEW,
+		VAI_HASHING,
+		VAI_RELIABLE, // cache, don't hash
+		VAI_UNRELIABLE, // never cache
+	};
+
+	uint64_t hash = 0; // not set by the constructor body, so give it a defined value here
+	u32 minihash = 0; // same as hash
+
+	void *vbo;
+	void *ebo;
+
+	// Precalculated parameter for drawRangeElements
+	u16 numVerts;
+	u16 maxIndex = 0; // not set by the constructor body either
+	s8 prim;
+	Status status;
+
+	// ID information
+	int numDraws;
+	int numFrames;
+	int lastFrame; // So that we can forget.
+	u16 drawsUntilNextFullHash;
+	u8 flags;
+};
+
+// Handles transform, lighting and drawing.
+class DrawEngineGX2 : public DrawEngineCommon { +public: + DrawEngineGX2(Draw::DrawContext *draw, GX2ContextState *context); + virtual ~DrawEngineGX2(); + + void SetShaderManager(ShaderManagerGX2 *shaderManager) { shaderManager_ = shaderManager; } + void SetTextureCache(TextureCacheGX2 *textureCache) { textureCache_ = textureCache; } + void SetFramebufferManager(FramebufferManagerGX2 *fbManager) { framebufferManager_ = fbManager; } + void InitDeviceObjects(); + void DestroyDeviceObjects(); + + void BeginFrame(); + + // So that this can be inlined + void Flush() { + if (!numDrawCalls) + return; + DoFlush(); + } + + void FinishDeferred() { + if (!numDrawCalls) + return; + DecodeVerts(decoded); + } + + void DispatchFlush() override { Flush(); } + + void ClearTrackedVertexArrays() override; + + void Resized() override; + + void ClearInputLayoutMap(); + +private: + void DoFlush(); + + void ApplyDrawState(int prim); + void ApplyDrawStateLate(bool applyStencilRef, u8 stencilRef); + void ResetShaderBlending(); + + GX2FetchShader *SetupFetchShaderForDraw(GX2VertexShader *vshader, const DecVtxFormat &decFmt, u32 pspFmt); + + void MarkUnreliable(VertexArrayInfoGX2 *vai); + + Draw::DrawContext *draw_; // Used for framebuffer related things exclusively. + GX2ContextState *context_; + + PrehashMap vai_; + + struct FetchShaderKey { + GX2VertexShader *vshader; + u32 decFmtId; + bool operator<(const FetchShaderKey &other) const { + if (decFmtId < other.decFmtId) + return true; + if (decFmtId > other.decFmtId) + return false; + return vshader < other.vshader; + } + }; + + DenseHashMap fetchShaderMap_; + + // Other + ShaderManagerGX2 *shaderManager_ = nullptr; + TextureCacheGX2 *textureCache_ = nullptr; + FramebufferManagerGX2 *framebufferManager_ = nullptr; + + // Pushbuffers + PushBufferGX2 *pushVerts_; + PushBufferGX2 *pushInds_; + PushBufferGX2 *pushUBO_; + + // GX2 state object caches. 
+ + struct GX2BlendState { + GX2ColorControlReg color; + GX2BlendControlReg blend; + GX2BlendConstantColorReg constant; + GX2TargetChannelMaskReg mask; + }; + + struct GX2RasterizerState { + GX2FrontFace frontFace_; + BOOL cullFront_; + BOOL cullBack_; + }; + + DenseHashMap blendCache_; + DenseHashMap depthStencilCache_; + DenseHashMap rasterCache_; + + // Keep the depth state between ApplyDrawState and ApplyDrawStateLate + GX2BlendState* blendState_ = nullptr; + GX2DepthStencilControlReg* depthStencilState_ = nullptr; + GX2RasterizerState* rasterState_ = nullptr; + + // State keys + GX2StateKeys keys_{}; + GX2DynamicState dynState_{}; + + // Hardware tessellation + class TessellationDataTransferGX2 : public TessellationDataTransfer { + private: + GX2ContextState *context_; + GX2Texture data_tex[3]; + + public: + TessellationDataTransferGX2(GX2ContextState *context_) : TessellationDataTransfer(), context_(context_), data_tex{} {} + ~TessellationDataTransferGX2() { + for (int i = 0; i < 3; i++) { + MEM2_free(data_tex[i].surface.image); + } + } + void SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) override; + }; + + int lastRenderStepId_ = -1; +}; diff --git a/GPU/GX2/FragmentShaderGeneratorGX2.cpp b/GPU/GX2/FragmentShaderGeneratorGX2.cpp new file mode 100644 index 000000000000..1e48ff30e035 --- /dev/null +++ b/GPU/GX2/FragmentShaderGeneratorGX2.cpp @@ -0,0 +1,232 @@ +// Copyright (c) 2017- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. 
+ +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. + +#include "GPU/Common/ShaderCommon.h" +#include "GPU/Common/ShaderUniforms.h" +#include "GPU/Common/GPUStateUtils.h" +#include "GPU/ge_constants.h" + +#include "GPU/GX2/FragmentShaderGeneratorGX2.h" +#include "GPU/GX2/ShaderManagerGX2.h" + +#include +#include +#include + +#include "GPU/Vulkan/FragmentShaderGeneratorVulkan.h" +#include + +using namespace GX2Gen; + +class FragmentShaderGeneratorGX2 : private GX2PixelShaderEmitter { +public: + FragmentShaderGeneratorGX2() {} + bool Supported(const FShaderID &id); + void Emit(const FShaderID &id, GX2PixelShader *ps); +}; + +bool FragmentShaderGeneratorGX2::Supported(const FShaderID &id) { + FShaderID unsupported; + unsupported.SetBit(FS_BIT_SHADER_DEPAL); + unsupported.SetBit(FS_BIT_SHADER_TEX_CLAMP); + unsupported.SetBit(FS_BIT_CLAMP_S); + unsupported.SetBit(FS_BIT_CLAMP_T); + unsupported.SetBit(FS_BIT_TEXTURE_AT_OFFSET); + unsupported.SetBit(FS_BIT_LMODE); + unsupported.SetBit(FS_BIT_COLOR_TEST); + unsupported.SetBits(FS_BIT_COLOR_TEST_FUNC, 2, -1); + unsupported.SetBit(FS_BIT_COLOR_AGAINST_ZERO); + unsupported.SetBit(FS_BIT_ENABLE_FOG); + unsupported.SetBit(FS_BIT_DO_TEXTURE_PROJ); + unsupported.SetBit(FS_BIT_COLOR_DOUBLE); + // unsupported.SetBits(FS_BIT_STENCIL_TO_ALPHA, 2, -1); + // unsupported.SetBits(FS_BIT_REPLACE_ALPHA_WITH_STENCIL_TYPE, 4, -1); + unsupported.SetBits(FS_BIT_REPLACE_LOGIC_OP_TYPE, 2, -1); + unsupported.SetBits(FS_BIT_REPLACE_BLEND, 3, -1); + unsupported.SetBits(FS_BIT_BLENDEQ, 3, -1); + unsupported.SetBits(FS_BIT_BLENDFUNC_A, 4, -1); + unsupported.SetBits(FS_BIT_BLENDFUNC_B, 4, -1); + unsupported.SetBit(FS_BIT_FLATSHADE); + unsupported.SetBit(FS_BIT_TEST_DISCARD_TO_ZERO); + unsupported.SetBit(FS_BIT_NO_DEPTH_CANNOT_DISCARD_STENCIL); + + 
return !(unsupported.d[0] & id.d[0]) && !(unsupported.d[1] & id.d[1]);
+}
+
+// Emits the pixel shader program described by the fragment shader ID `id` into `ps`.
+// Only the alpha-test function, texture function, stencil-to-alpha mode and the
+// replace-alpha-with-stencil type are decoded from the ID; the remaining fields are
+// still commented out below.
+// Emission order: optional texture sample + texture function, optional alpha test,
+// selection of the replacement alpha, then the colour export(s) and END_OF_PROGRAM.
+void FragmentShaderGeneratorGX2::Emit(const FShaderID &id, GX2PixelShader *ps) {
+	GEComparison alphaTestFunc = (GEComparison)id.Bits(FS_BIT_ALPHA_TEST_FUNC, 3);
+	// GEComparison colorTestFunc = (GEComparison)id.Bits(FS_BIT_COLOR_TEST_FUNC, 2);
+	GETexFunc texFunc = (GETexFunc)id.Bits(FS_BIT_TEXFUNC, 3);
+	// ReplaceBlendType replaceBlend = (ReplaceBlendType)(id.Bits(FS_BIT_REPLACE_BLEND, 3));
+	ReplaceAlphaType stencilToAlpha = (ReplaceAlphaType)(id.Bits(FS_BIT_STENCIL_TO_ALPHA, 2));
+	// GEBlendSrcFactor replaceBlendFuncA = (GEBlendSrcFactor)id.Bits(FS_BIT_BLENDFUNC_A, 4);
+	// GEBlendDstFactor replaceBlendFuncB = (GEBlendDstFactor)id.Bits(FS_BIT_BLENDFUNC_B, 4);
+	// GEBlendMode replaceBlendEq = (GEBlendMode)id.Bits(FS_BIT_BLENDEQ, 3);
+	StencilValueType replaceAlphaWithStencilType = (StencilValueType)id.Bits(FS_BIT_REPLACE_ALPHA_WITH_STENCIL_TYPE, 4);
+
+	// The interpolated vertex colour is the starting value of the output colour.
+	Reg color = allocImportReg(PSInput::COLOR0);
+
+	// Texturing and alpha testing are skipped entirely in clear mode.
+	if (!id.Bit(FS_BIT_CLEARMODE)) {
+		if (id.Bit(FS_BIT_DO_TEXTURE)) {
+			Reg coords = allocImportReg(PSInput::COORDS);
+			// The sample result is written over the register holding the coordinates.
+			Reg sample = coords;
+			SAMPLE(sample, coords(x, y, _0_, _0_), 0, 0);
+			// if (id.Bit(FS_BIT_BGRA_TEXTURE))
+			//	sample = sample(b, g, r, a);
+
+			switch (texFunc) {
+			case GE_TEXFUNC_REPLACE:
+				// With texture alpha the whole sample replaces the colour;
+				// otherwise only rgb is copied and the vertex alpha is kept.
+				if (id.Bit(FS_BIT_TEXALPHA))
+					color = sample;
+				else {
+					MOV(color(r), sample(r));
+					MOV(color(g), sample(g));
+					MOV(color(b), sample(b));
+					ALU_LAST();
+				}
+				break;
+			case GE_TEXFUNC_DECAL:
+				// TODO
+				// Placeholder: currently emits the same instructions as GE_TEXFUNC_ADD.
+				ADD(color(r), color(r), sample(r));
+				ADD(color(g), color(g), sample(g));
+				ADD(color(b), color(b), sample(b));
+				if (id.Bit(FS_BIT_TEXALPHA))
+					ADD(color(a), color(a), sample(a));
+				ALU_LAST();
+				break;
+			case GE_TEXFUNC_MODULATE:
+				MUL(color(r), color(r), sample(r));
+				MUL(color(g), color(g), sample(g));
+				MUL(color(b), color(b), sample(b));
+				if (id.Bit(FS_BIT_TEXALPHA))
+					MUL(color(a), color(a), sample(a));
+				ALU_LAST();
+				break;
+			// Any texture function not handled above falls back to ADD.
+			default:
+			case GE_TEXFUNC_ADD:
+				ADD(color(r), color(r), sample(r));
+				ADD(color(g), color(g), sample(g));
+				ADD(color(b), color(b), sample(b));
+				if (id.Bit(FS_BIT_TEXALPHA))
+					ADD(color(a), color(a), sample(a));
+				ALU_LAST();
+				break;
+			}
+		}
+
+		if (id.Bit(FS_BIT_ALPHA_TEST)) {
+			// Only the compare-against-zero form is implemented; 0.002f is the threshold used for it.
+			if (id.Bit(FS_BIT_ALPHA_AGAINST_ZERO)) {
+				if (alphaTestFunc == GE_COMP_NOTEQUAL || alphaTestFunc == GE_COMP_GREATER)
+					KILLGT(x, C(0.002f), color(a));
+				else if (alphaTestFunc != GE_COMP_NEVER)
+					KILLGT(x, color(a), C(0.002f));
+				else
+					KILLE(x, color(a), color(a));
+			} else {
+				// TODO
+			}
+			// NOTE(review): ALU_LAST() is also reached when the TODO branch above emitted
+			// no instruction - confirm that is harmless for the assembler.
+			ALU_LAST();
+		}
+	}
+
+	// Choose the value that will stand in for alpha when stencil is routed through alpha.
+	SrcChannel replacedAlpha = C(0.0f);
+	if (stencilToAlpha != REPLACE_ALPHA_NO) {
+		switch (replaceAlphaWithStencilType) {
+		case STENCIL_VALUE_UNIFORM: {
+			// Read from the `stencil` member of the UB_VS_FS_Base uniform block.
+			replacedAlpha = KCacheChannel(UB_Bindings::Base, offsetof(UB_VS_FS_Base, stencil), this);
+			break;
+		}
+
+		case STENCIL_VALUE_ZERO: replacedAlpha = C(0.0f); break;
+
+		case STENCIL_VALUE_ONE:
+		case STENCIL_VALUE_INVERT:
+			// In invert, we subtract by one, but we want to output one here.
+			replacedAlpha = C(1.0f);
+			break;
+
+		case STENCIL_VALUE_INCR_4:
+		case STENCIL_VALUE_DECR_4:
+			// We're adding/subtracting, just by the smallest value in 4-bit.
+			replacedAlpha = C(1.0f / 15.0f);
+			break;
+
+		case STENCIL_VALUE_INCR_8:
+		case STENCIL_VALUE_DECR_8:
+			// We're adding/subtracting, just by the smallest value in 8-bit.
+			replacedAlpha = C(1.0f / 255.0f);
+			break;
+
+		case STENCIL_VALUE_KEEP:
+			// Do nothing. We'll mask out the alpha using color mask.
+			break;
+		}
+	}
+
+	// Export the result; the export layout depends on the stencil-to-alpha mode.
+	switch (stencilToAlpha) {
+	case REPLACE_ALPHA_DUALSOURCE: {
+		// PIX0 receives rgb + replaced alpha, PIX1 receives the original alpha.
+		Reg temp = allocReg();
+		MOV(temp(r), color(r));
+		MOV(temp(g), color(g));
+		MOV(temp(b), color(b));
+		MOV(temp(a), replacedAlpha);
+		ALU_LAST();
+		EXP(PIX0, temp);
+		EXP(PIX1, color(_0_, _0_, _0_, a));
+		break;
+	}
+
+	case REPLACE_ALPHA_YES:
+		MOV(color(a), replacedAlpha);
+		ALU_LAST();
+		EXP(PIX0, color);
+		break;
+
+	case REPLACE_ALPHA_NO: EXP(PIX0, color); break;
+
+	default:
+		ERROR_LOG(G3D, "Bad stencil-to-alpha type, corrupt ID?");
+		EXP(PIX0, color);
+		break;
+	}
+
+	END_OF_PROGRAM(ps);
+}
+
+// Fills `ps` with the pixel shader for `id`. When the generator reports the ID as
+// supported the shader is emitted; otherwise a warning is logged and `*ps` is
+// overwritten with a copy of PShaderAllGX2.
+void GenerateFragmentShaderGX2(const FShaderID &id, GX2PixelShader *ps) {
+	FragmentShaderGeneratorGX2 fsGen;
+	if (fsGen.Supported(id)) {
+		fsGen.Emit(id, ps);
+// Disabled debug dump: register info, disassembly and the GLSL generated for the same ID.
+#if 0
+		char buffer[0x20000];
+		printf("\n### GPU Regs ###\n");
+		GX2PixelShaderInfo(ps, buffer);
+		puts(buffer);
+
+		printf("\n### ASM ###\n%s\n", FragmentShaderDesc(id).c_str());
+		DisassembleGX2Shader(ps->program, ps->size, buffer);
+		puts(buffer);
+
+		printf("\n### glsl ###\n");
+		GenerateVulkanGLSLFragmentShader(id, buffer, 0);
+		puts(buffer);
+#endif
+	} else {
+		WARN_LOG(G3D, "unsupported FShaderID: \"%s\"", FragmentShaderDesc(id).c_str());
+		*ps = PShaderAllGX2;
+	}
+}
diff --git a/GPU/GX2/FragmentShaderGeneratorGX2.h b/GPU/GX2/FragmentShaderGeneratorGX2.h
new file mode 100644
index 000000000000..71338fd54375
--- /dev/null
+++ b/GPU/GX2/FragmentShaderGeneratorGX2.h
@@ -0,0 +1,24 @@
+// Copyright (c) 2017- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#pragma once
+
+#include // NOTE(review): the <...> header name was stripped when this diff was extracted; restore it from the upstream commit.
+
+#include "GPU/Common/ShaderId.h"
+
+// Builds the GX2 pixel shader for the fragment shader ID `id` and stores it in `*ps`.
+void GenerateFragmentShaderGX2(const FShaderID &id, GX2PixelShader *ps);
diff --git a/GPU/GX2/FramebufferManagerGX2.cpp b/GPU/GX2/FramebufferManagerGX2.cpp
new file mode 100644
index 000000000000..9505998f99e6
--- /dev/null
+++ b/GPU/GX2/FramebufferManagerGX2.cpp
@@ -0,0 +1,386 @@
+// Copyright (c) 2017- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#include "profiler/profiler.h"
+#include "base/display.h"
+#include "math/lin/matrix4x4.h"
+#include "ext/native/thin3d/thin3d.h"
+#include "base/basictypes.h"
+#include "file/vfs.h"
+#include "file/zip_read.h"
+#include "i18n/i18n.h"
+
+#include "Common/ColorConv.h"
+#include "Common/MathUtil.h"
+#include "Core/Host.h"
+#include "Core/MemMap.h"
+#include "Core/Config.h"
+#include "Core/System.h"
+#include "Core/Reporting.h"
+#include "GPU/ge_constants.h"
+#include "GPU/GPUState.h"
+#include "GPU/Debugger/Stepping.h"
+
+#include "GPU/Common/FramebufferManagerCommon.h"
+#include "GPU/Common/PresentationCommon.h"
+#include "GPU/Common/ShaderTranslation.h"
+#include "GPU/Common/TextureDecoder.h"
+#include "GPU/Common/PostShader.h"
+#include "GPU/GX2/FramebufferManagerGX2.h"
+#include "GPU/GX2/ShaderManagerGX2.h"
+#include "GPU/GX2/TextureCacheGX2.h"
+#include "GPU/GX2/DrawEngineGX2.h"
+#include "GPU/GX2/GX2Shaders.h"
+
+#include "ext/native/thin3d/thin3d.h"
+
+// NOTE(review): the <...> header names of the next two includes were stripped when
+// this diff was extracted; restore them from the upstream commit.
+#include
+#include
+
+// Vertex layout of the 2D quad: a 3-float position at byte offset 0 followed by a
+// 2-float texture coordinate at byte offset 12.
+// clang-format off
+const GX2AttribStream FramebufferManagerGX2::g_QuadAttribStream[2] = {
+	{ 0, 0, 0, GX2_ATTRIB_FORMAT_FLOAT_32_32_32, GX2_ATTRIB_INDEX_PER_VERTEX, 0, GX2_COMP_SEL(_x, _y, _z, _1), GX2_ENDIAN_SWAP_DEFAULT },
+	{ 1, 0, 12, GX2_ATTRIB_FORMAT_FLOAT_32_32, GX2_ATTRIB_INDEX_PER_VERTEX, 0, GX2_COMP_SEL(_x, _y, _0, _0), GX2_ENDIAN_SWAP_DEFAULT },
+};
+
+// STRIP geometry
+// Static full-screen quad: four vertices of (x, y, z, u, v).
+__attribute__((aligned(GX2_VERTEX_BUFFER_ALIGNMENT)))
+float FramebufferManagerGX2::fsQuadBuffer_[20] = {
+	-1.0f,-1.0f, 0.0f, 0.0f, 0.0f,
+	 1.0f,-1.0f, 0.0f, 1.0f, 0.0f,
+	-1.0f, 1.0f, 0.0f, 0.0f, 1.0f,
+	 1.0f, 1.0f, 0.0f, 1.0f, 1.0f,
+};
+// clang-format on
+
+// Caches the native GX2 context, builds the fetch shader for the quad layout,
+// allocates the dynamic quad vertex buffer and pre-computes one stencil mask
+// register per 8-bit stencil value.
+FramebufferManagerGX2::FramebufferManagerGX2(Draw::DrawContext *draw)
+	: FramebufferManagerCommon(draw) {
+	context_ = (GX2ContextState *)draw->GetNativeObject(Draw::NativeObject::CONTEXT);
+
+	quadFetchShader_.size = GX2CalcFetchShaderSize(ARRAY_SIZE(g_QuadAttribStream));
+	quadFetchShader_.program = (u8 *)MEM2_alloc(quadFetchShader_.size, GX2_SHADER_ALIGNMENT);
+	GX2InitFetchShader(&quadFetchShader_, quadFetchShader_.program, ARRAY_SIZE(g_QuadAttribStream), g_QuadAttribStream);
+	GX2Invalidate(GX2_INVALIDATE_MODE_CPU_SHADER, quadFetchShader_.program, quadFetchShader_.size);
+
+	// Room for four vertices of quadStride_ bytes each.
+	quadBuffer_ = (float*)MEM2_alloc(quadStride_ * 4, GX2_VERTEX_BUFFER_ALIGNMENT);
+
+	for (int i = 0; i < 256; i++) {
+		GX2InitStencilMaskReg(&stencilMaskStates_[i], i, 0xFF, 0xFF, i, 0xFF, 0xFF);
+	}
+
+	ShaderTranslationInit();
+
+//	presentation_->SetLanguage(GLSL_300);
+//	preferredPixelsFormat_ = Draw::DataFormat::B8G8R8A8_UNORM;
+}
+
+// Releases everything the constructor (and the stencil code) allocated from MEM2.
+FramebufferManagerGX2::~FramebufferManagerGX2() {
+	ShaderTranslationShutdown();
+
+	// Drawing cleanup
+	MEM2_free(quadFetchShader_.program);
+	MEM2_free(quadBuffer_);
+
+	// Stencil cleanup
+	if (stencilValueBuffer_)
+		MEM2_free(stencilValueBuffer_);
+}
+
+// Stores the texture cache both as the GX2-typed pointer and in the base-class member.
+void FramebufferManagerGX2::SetTextureCache(TextureCacheGX2 *tc) {
+	textureCacheGX2_ = tc;
+	textureCache_ = tc;
+}
+
+// Stores the shader manager both as the GX2-typed pointer and in the base-class member.
+void FramebufferManagerGX2::SetShaderManager(ShaderManagerGX2 *sm) {
+	shaderManagerGX2_ = sm;
+	shaderManager_ = sm;
+}
+
+// Stores the draw engine both as the GX2-typed pointer and in the base-class member.
+void FramebufferManagerGX2::SetDrawEngine(DrawEngineGX2 *td) {
+	drawEngineGX2_ = td;
+	drawEngine_ = td;
+}
+
+// Draws the currently bound texture as a quad at (x, y) with size (w, h), given in
+// units of a destW x destH target, using texture coordinates (u0, v0)-(u1, v1).
+// `uvRotation` rotates the texture coordinates in quarter turns and `flags` selects
+// linear or point sampling. This function does not bind any shader itself.
+void FramebufferManagerGX2::DrawActiveTexture(float x, float y, float w, float h, float destW, float destH, float u0, float v0, float u1, float v1, int uvRotation, int flags) {
+	struct Coord {
+		Lin::Vec3 pos; float u, v;
+	};
+	Coord coord[4] = {
+		{ { x, y, 0 }, u0, v0 },
+		{ { x + w, y, 0 }, u1, v0 },
+		{ { x + w, y + h, 0 }, u1, v1 },
+		{ { x, y + h, 0 }, u0, v1 },
+	};
+
+	if (uvRotation != ROTATION_LOCKED_HORIZONTAL) {
+		float temp[8];
+		int rotation = 0;
+		switch (uvRotation) {
+		case ROTATION_LOCKED_HORIZONTAL180: rotation = 2; break;
+		case ROTATION_LOCKED_VERTICAL: rotation = 1; break;
+		case ROTATION_LOCKED_VERTICAL180: rotation = 3; break;
+		}
+		// Rotate by cycling the uv pairs through the four corners.
+		for (int i = 0; i < 4; i++) {
+			temp[i * 2] = coord[((i + rotation) & 3)].u;
+			temp[i * 2 + 1] = coord[((i + rotation) & 3)].v;
+		}
+
+		for (int i = 0; i < 4; i++) {
+			coord[i].u = temp[i * 2];
+			coord[i].v = temp[i * 2 + 1];
+		}
+	}
+
+	// Map destination coordinates to [-1, 1], flipping y.
+	float invDestW = 1.0f / (destW * 0.5f);
+	float invDestH = 1.0f / (destH * 0.5f);
+	for (int i = 0; i < 4; i++) {
+		coord[i].pos.x = coord[i].pos.x * invDestW - 1.0f;
+		coord[i].pos.y = -(coord[i].pos.y * invDestH - 1.0f);
+	}
+
+	if (g_display_rotation != DisplayRotation::ROTATE_0) {
+		for (int i = 0; i < 4; i++) {
+			// backwards notation, should fix that...
+			coord[i].pos = coord[i].pos * g_display_rot_matrix;
+		}
+	}
+
+	// The above code is for FAN geometry but we can only do STRIP. So rearrange it a little.
+	// quadBuffer_ is a float pointer, so each vertex starts 5 floats after the previous one.
+	memcpy(quadBuffer_, coord, sizeof(Coord));
+	memcpy(quadBuffer_ + 5, coord + 1, sizeof(Coord));
+	memcpy(quadBuffer_ + 10, coord + 3, sizeof(Coord));
+	memcpy(quadBuffer_ + 15, coord + 2, sizeof(Coord));
+	GX2Invalidate(GX2_INVALIDATE_MODE_CPU_ATTRIBUTE_BUFFER, quadBuffer_, sizeof(coord));
+
+	GX2SetCullOnlyControl(GX2_FRONT_FACE_CCW, GX2_DISABLE, GX2_DISABLE);
+	GX2SetColorControlReg(&StockGX2::blendDisabledColorWrite);
+	GX2SetTargetChannelMasksReg(&StockGX2::TargetChannelMasks[0xF]);
+	GX2SetDepthStencilControlReg(&StockGX2::depthStencilDisabled);
+	GX2SetPixelSampler((flags & DRAWTEX_LINEAR) ? &StockGX2::samplerLinear2DClamp : &StockGX2::samplerPoint2DClamp, 0);
+	GX2SetAttribBuffer(0, sizeof(coord), sizeof(*coord), quadBuffer_);
+	GX2DrawEx(GX2_PRIMITIVE_MODE_TRIANGLE_STRIP, 4, 0, 1);
+	// The cached GPU state no longer matches what was just programmed.
+	gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE);
+}
+
+// Binds the quad fetch shader together with the default pixel and vertex shaders.
+void FramebufferManagerGX2::Bind2DShader() {
+	GX2SetFetchShader(&quadFetchShader_);
+	GX2SetPixelShader(&defPShaderGX2);
+	GX2SetVertexShader(&defVShaderGX2);
+}
+
+// Called when `vfb` changes pixel format; `old` is the previous format. Only the
+// transition away from 565 does anything (see the explanation below).
+void FramebufferManagerGX2::ReformatFramebufferFrom(VirtualFramebuffer *vfb, GEBufferFormat old) {
+	if (!useBufferedRendering_ || !vfb->fbo) {
+		return;
+	}
+
+	// Technically, we should at this point re-interpret the bytes of the old format to the new.
+	// That might get tricky, and could cause unnecessary slowness in some games.
+	// For now, we just clear alpha/stencil from 565, which fixes shadow issues in Kingdom Hearts.
+	// (it uses 565 to write zeros to the buffer, than 4444 to actually render the shadow.)
+	//
+	// The best way to do this may ultimately be to create a new FBO (combine with any resize?)
+	// and blit with a shader to that, then replace the FBO on vfb. Stencil would still be complex
+	// to exactly reproduce in 4444 and 8888 formats.
+	if (old == GE_FORMAT_565) {
+		draw_->BindFramebufferAsRenderTarget(vfb->fbo, { Draw::RPAction::CLEAR, Draw::RPAction::KEEP, Draw::RPAction::CLEAR }, "ReformatFramebuffer");
+
+		// TODO: There's no way this does anything useful :(
+		GX2SetDepthStencilControlReg(&StockGX2::depthDisabledStencilWrite);
+		GX2SetStencilMask(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); // TODO, and maybe GX2SetStencilMaskReg?
+		GX2SetColorControlReg(&StockGX2::blendColorDisabled);
+		GX2SetCullOnlyControl(GX2_FRONT_FACE_CCW, GX2_DISABLE, GX2_DISABLE);
+		GX2SetFetchShader(&quadFetchShader_);
+		GX2SetPixelShader(&defPShaderGX2);
+		GX2SetVertexShader(&defVShaderGX2);
+		GX2SetAttribBuffer(0, sizeof(fsQuadBuffer_), quadStride_, fsQuadBuffer_);
+		GX2SetPixelSampler(&StockGX2::samplerPoint2DClamp, 0);
+//		GX2SetPixelTexture(nullptr, 0);
+		shaderManagerGX2_->DirtyLastShader();
+		GX2SetViewport( 0.0f, 0.0f, (float)vfb->renderWidth, (float)vfb->renderHeight, 0.0f, 1.0f);
+		GX2SetScissor(0, 0, vfb->renderWidth, vfb->renderHeight);
+		GX2DrawEx(GX2_PRIMITIVE_MODE_TRIANGLE_STRIP, 4, 0, 1);
+
+		textureCache_->ForgetLastTexture();
+		gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_VERTEXSHADER_STATE);
+	}
+}
+
+// Copies the first three bytes of each of `c` 32-bit pixels from `srcp` to `dstp`,
+// leaving the fourth byte of every destination pixel untouched.
+// NOTE(review): which bytes hold depth depends on byte order; on a big-endian target
+// the first three bytes are the most significant ones - confirm before using this.
+static void CopyPixelDepthOnly(u32 *dstp, const u32 *srcp, size_t c) {
+	for (size_t x = 0; x < c; ++x) {
+		memcpy(dstp + x, srcp + x, 3);
+	}
+}
+
+// Copies the depth buffer of `src` into `dst`, but only when both framebuffers share
+// the same z address (with non-zero strides) and have identical sizes and render sizes.
+void FramebufferManagerGX2::BlitFramebufferDepth(VirtualFramebuffer *src, VirtualFramebuffer *dst) {
+	bool matchingDepthBuffer = src->z_address == dst->z_address && src->z_stride != 0 && dst->z_stride != 0;
+	bool matchingSize = src->width == dst->width && src->height == dst->height;
+	bool matchingRenderSize = src->renderWidth == dst->renderWidth && src->renderHeight == dst->renderHeight;
+	if (matchingDepthBuffer && matchingSize && matchingRenderSize) {
+		// TODO: Currently, this copies depth AND stencil, which is a problem. See #9740.
+		draw_->CopyFramebufferImage(src->fbo, 0, 0, 0, 0, dst->fbo, 0, 0, 0, 0, src->renderWidth, src->renderHeight, 1, Draw::FB_DEPTH_BIT, "BlitFramebufferDepth");
+		RebindFramebuffer("RebindFramebuffer - BlitFramebufferDepth");
+		dst->last_frame_depth_updated = gpuStats.numFlips;
+	}
+}
+
+// Binds the colour buffer of `framebuffer` as a texture at `stage`. If it is the
+// framebuffer currently being rendered to and `flags` allows it, a temporary copy is
+// bound instead; if copying is not allowed the draw is flagged to be skipped.
+void FramebufferManagerGX2::BindFramebufferAsColorTexture(int stage, VirtualFramebuffer *framebuffer, int flags) {
+	if (!framebuffer->fbo || !useBufferedRendering_) {
+		// GX2SetPixelTexture(nullptr, 1); // TODO: what is the correct way to unbind a texture ?
+		gstate_c.skipDrawReason |= SKIPDRAW_BAD_FB_TEXTURE;
+		return;
+	}
+
+	// currentRenderVfb_ will always be set when this is called, except from the GE debugger.
+	// Let's just not bother with the copy in that case.
+	bool skipCopy = (flags & BINDFBCOLOR_MAY_COPY) == 0;
+	if (GPUStepping::IsStepping()) {
+		skipCopy = true;
+	}
+	// Currently rendering to this framebuffer. Need to make a copy.
+	if (!skipCopy && framebuffer == currentRenderVfb_) {
+		// TODO: Maybe merge with bvfbs_? Not sure if those could be packing, and they're created at a different size.
+		Draw::Framebuffer *renderCopy = GetTempFBO(TempFBO::COPY, framebuffer->renderWidth, framebuffer->renderHeight, (Draw::FBColorDepth)framebuffer->colorDepth);
+		if (renderCopy) {
+			// Describe the temporary with a copy of the framebuffer's metadata.
+			VirtualFramebuffer copyInfo = *framebuffer;
+			copyInfo.fbo = renderCopy;
+			CopyFramebufferForColorTexture(&copyInfo, framebuffer, flags);
+			RebindFramebuffer("RebindFramebuffer - BindFramebufferAsColorTexture");
+			draw_->BindFramebufferAsTexture(renderCopy, stage, Draw::FB_COLOR_BIT, 0);
+		} else {
+			draw_->BindFramebufferAsTexture(framebuffer->fbo, stage, Draw::FB_COLOR_BIT, 0);
+		}
+	} else if (framebuffer != currentRenderVfb_) {
+		draw_->BindFramebufferAsTexture(framebuffer->fbo, stage, Draw::FB_COLOR_BIT, 0);
+	} else {
+		ERROR_LOG_REPORT_ONCE(GX2SelfTexture, G3D, "Attempting to texture from target (src=%08x / target=%08x / flags=%d)", framebuffer->fb_address, currentRenderVfb_->fb_address, flags);
+		// Badness on GX2 to bind the currently rendered-to framebuffer as a texture.
+		// GX2SetPixelTexture(nullptr, 1); // TODO: what is the correct way to unbind a texture ?
+		gstate_c.skipDrawReason |= SKIPDRAW_BAD_FB_TEXTURE;
+		return;
+	}
+}
+
+// Required override; this backend keeps no download temp buffer state.
+void FramebufferManagerGX2::UpdateDownloadTempBuffer(VirtualFramebuffer *nvfb) {
+	// Nothing to do here.
+}
+
+// Copies the rectangle (srcX1, srcY1)-(srcX2, srcY2) of `src` onto the rectangle
+// (destX1, destY1)-(destX2, destY2) of `dest`. When both framebuffers and both
+// rectangles have the same size this is a plain image copy; otherwise a textured
+// quad is drawn, filtered linearly or by point sampling according to `linearFilter`.
+void FramebufferManagerGX2::SimpleBlit(
+	Draw::Framebuffer *dest, float destX1, float destY1, float destX2, float destY2,
+	Draw::Framebuffer *src, float srcX1, float srcY1, float srcX2, float srcY2, bool linearFilter) {
+
+	int destW, destH, srcW, srcH;
+	draw_->GetFramebufferDimensions(src, &srcW, &srcH);
+	draw_->GetFramebufferDimensions(dest, &destW, &destH);
+
+	if (srcW == destW && srcH == destH && destX2 - destX1 == srcX2 - srcX1 && destY2 - destY1 == srcY2 - srcY1) {
+		// Optimize to a copy
+		draw_->CopyFramebufferImage(src, 0, (int)srcX1, (int)srcY1, 0, dest, 0, (int)destX1, (int)destY1, 0, (int)(srcX2 - srcX1), (int)(srcY2 - srcY1), 1, Draw::FB_COLOR_BIT, "SimpleBlit");
+		return;
+	}
+
+	// Scale factors from pixels to normalized position / texture coordinates.
+	float dX = 1.0f / (float)destW;
+	float dY = 1.0f / (float)destH;
+	float sX = 1.0f / (float)srcW;
+	float sY = 1.0f / (float)srcH;
+	struct Vtx {
+		float x, y, z, u, v;
+	};
+	Vtx vtx[4] = {
+		{ -1.0f + 2.0f * dX * destX1, 1.0f - 2.0f * dY * destY1, 0.0f, sX * srcX1, sY * srcY1 },
+		{ -1.0f + 2.0f * dX * destX2, 1.0f - 2.0f * dY * destY1, 0.0f, sX * srcX2, sY * srcY1 },
+		{ -1.0f + 2.0f * dX * destX1, 1.0f - 2.0f * dY * destY2, 0.0f, sX * srcX1, sY * srcY2 },
+		{ -1.0f + 2.0f * dX * destX2, 1.0f - 2.0f * dY * destY2, 0.0f, sX * srcX2, sY * srcY2 },
+	};
+
+	memcpy(quadBuffer_, vtx, 4 * sizeof(Vtx));
+	// The vertices were just written by the CPU, so use the CPU variant of the
+	// invalidate mode, as DrawActiveTexture() does for the same buffer.
+	GX2Invalidate(GX2_INVALIDATE_MODE_CPU_ATTRIBUTE_BUFFER, quadBuffer_, 4 * sizeof(Vtx));
+
+	// Unbind the texture first to avoid the GX2 hazard check (can't set render target to things bound as textures and vice versa, not even temporarily).
+	draw_->BindTexture(0, nullptr);
+	draw_->BindFramebufferAsRenderTarget(dest, { Draw::RPAction::KEEP, Draw::RPAction::KEEP, Draw::RPAction::KEEP }, "SimpleBlit");
+	draw_->BindFramebufferAsTexture(src, 0, Draw::FB_COLOR_BIT, 0);
+
+	Bind2DShader();
+	GX2SetViewport( 0.0f, 0.0f, (float)destW, (float)destH, 0.0f, 1.0f );
+	GX2SetScissor(0, 0, destW, destH);
+	GX2SetCullOnlyControl(GX2_FRONT_FACE_CCW, GX2_DISABLE, GX2_DISABLE);
+	GX2SetColorControlReg(&StockGX2::blendDisabledColorWrite);
+	GX2SetTargetChannelMasksReg(&StockGX2::TargetChannelMasks[0xF]);
+	GX2SetDepthStencilControlReg(&StockGX2::depthStencilDisabled);
+	GX2SetPixelSampler(linearFilter ? &StockGX2::samplerLinear2DClamp : &StockGX2::samplerPoint2DClamp, 0);
+	GX2SetAttribBuffer(0, 4 * sizeof(Vtx), sizeof(Vtx), quadBuffer_);
+	GX2DrawEx(GX2_PRIMITIVE_MODE_TRIANGLE_STRIP, 4, 0, 1);
+
+	gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_VERTEXSHADER_STATE);
+}
+
+// Blits a w x h block (in buffer pixels) from (srcX, srcY) in `src` to (dstX, dstY) in
+// `dst`. Coordinates are scaled from buffer size to render size, and the x scale is
+// additionally corrected when `bpp` differs from a framebuffer's own bytes per pixel.
+void FramebufferManagerGX2::BlitFramebuffer(VirtualFramebuffer *dst, int dstX, int dstY, VirtualFramebuffer *src, int srcX, int srcY, int w, int h, int bpp) {
+	if (!dst->fbo || !src->fbo || !useBufferedRendering_) {
+		// This can happen if they recently switched from non-buffered.
+		if (useBufferedRendering_) {
+			draw_->BindFramebufferAsRenderTarget(nullptr, { Draw::RPAction::KEEP, Draw::RPAction::KEEP, Draw::RPAction::KEEP }, "BlitFramebuffer_Fail");
+		}
+		return;
+	}
+
+	float srcXFactor = (float)src->renderWidth / (float)src->bufferWidth;
+	float srcYFactor = (float)src->renderHeight / (float)src->bufferHeight;
+	const int srcBpp = src->format == GE_FORMAT_8888 ? 4 : 2;
+	if (srcBpp != bpp && bpp != 0) {
+		srcXFactor = (srcXFactor * bpp) / srcBpp;
+	}
+	int srcX1 = srcX * srcXFactor;
+	int srcX2 = (srcX + w) * srcXFactor;
+	int srcY1 = srcY * srcYFactor;
+	int srcY2 = (srcY + h) * srcYFactor;
+
+	float dstXFactor = (float)dst->renderWidth / (float)dst->bufferWidth;
+	float dstYFactor = (float)dst->renderHeight / (float)dst->bufferHeight;
+	const int dstBpp = dst->format == GE_FORMAT_8888 ? 4 : 2;
+	if (dstBpp != bpp && bpp != 0) {
+		dstXFactor = (dstXFactor * bpp) / dstBpp;
+	}
+	int dstX1 = dstX * dstXFactor;
+	int dstX2 = (dstX + w) * dstXFactor;
+	int dstY1 = dstY * dstYFactor;
+	int dstY2 = (dstY + h) * dstYFactor;
+
+	// Direct3D doesn't support rect -> self.
+	// NOTE(review): the temporary is filled at the destination rectangle but then read
+	// back at the source rectangle, and tempFBO is not checked for null - confirm both.
+	Draw::Framebuffer *srcFBO = src->fbo;
+	if (src == dst) {
+		Draw::Framebuffer *tempFBO = GetTempFBO(TempFBO::BLIT, src->renderWidth, src->renderHeight, (Draw::FBColorDepth)src->colorDepth);
+		SimpleBlit(tempFBO, dstX1, dstY1, dstX2, dstY2, src->fbo, srcX1, srcY1, srcX2, srcY2, false);
+		srcFBO = tempFBO;
+	}
+	SimpleBlit(dst->fbo, dstX1, dstY1, dstX2, dstY2, srcFBO, srcX1, srcY1, srcX2, srcY2, false);
+}
+
+// Nobody calls this yet.
+// Stub: validates that `vfb` has an FBO and otherwise does nothing.
+void FramebufferManagerGX2::PackDepthbuffer(VirtualFramebuffer *vfb, int x, int y, int w, int h) {
+	if (!vfb->fbo) {
+		ERROR_LOG_REPORT_ONCE(vfbfbozero, SCEGE, "PackDepthbuffer: vfb->fbo == 0");
+		return;
+	}
+
+	const u32 z_address = vfb->z_address;
+	// TODO
+}
+
+// No per-frame cleanup is needed in this backend.
+void FramebufferManagerGX2::EndFrame() {}
+
+// Drops every framebuffer object when the device is lost.
+void FramebufferManagerGX2::DeviceLost() { DestroyAllFBOs(); }
+
diff --git a/GPU/GX2/FramebufferManagerGX2.h b/GPU/GX2/FramebufferManagerGX2.h
new file mode 100644
index 000000000000..b2c5fe69ebe2
--- /dev/null
+++ b/GPU/GX2/FramebufferManagerGX2.h
@@ -0,0 +1,93 @@
+// Copyright (c) 2012- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#pragma once
+
+// NOTE(review): the <...> header names of the four includes below were stripped when
+// this diff was extracted; restore them from the upstream commit.
+#include
+#include
+#include
+
+#include
+
+// Keeps track of allocated FBOs.
+// Also provides facilities for drawing and later converting raw
+// pixel data.
+
+#include "GPU/GPUCommon.h"
+#include "GPU/Common/FramebufferManagerCommon.h"
+#include "Core/Config.h"
+#include "ext/native/thin3d/thin3d.h"
+
+class TextureCacheGX2;
+class DrawEngineGX2;
+class ShaderManagerGX2;
+
+// GX2 (Wii U) implementation of the framebuffer manager. It owns the fetch shader and
+// vertex buffers used for 2D quad draws and implements the backend-specific blit,
+// bind and reformat operations declared by FramebufferManagerCommon.
+class FramebufferManagerGX2 : public FramebufferManagerCommon {
+public:
+	FramebufferManagerGX2(Draw::DrawContext *draw);
+	~FramebufferManagerGX2();
+
+	// Each setter stores the pointer both in the GX2-typed member and in the base-class member.
+	void SetTextureCache(TextureCacheGX2 *tc);
+	void SetShaderManager(ShaderManagerGX2 *sm);
+	void SetDrawEngine(DrawEngineGX2 *td);
+	void DrawActiveTexture(float x, float y, float w, float h, float destW, float destH, float u0, float v0, float u1, float v1, int uvRotation, int flags) override;
+
+	void EndFrame();
+	void DeviceLost();
+	void ReformatFramebufferFrom(VirtualFramebuffer *vfb, GEBufferFormat old) override;
+
+	void BlitFramebufferDepth(VirtualFramebuffer *src, VirtualFramebuffer *dst) override;
+
+	void BindFramebufferAsColorTexture(int stage, VirtualFramebuffer *framebuffer, int flags);
+
+	virtual bool NotifyStencilUpload(u32 addr, int size, StencilUpload flags = StencilUpload::NEEDS_CLEAR) override;
+
+	// TODO: Remove
+	void *GetDynamicQuadBuffer() { return quadBuffer_; }
+
+protected:
+	// Used by ReadFramebufferToMemory and later framebuffer block copies
+	void BlitFramebuffer(VirtualFramebuffer *dst, int dstX, int dstY, VirtualFramebuffer *src, int srcX, int srcY, int w, int h, int bpp) override;
+
+	void UpdateDownloadTempBuffer(VirtualFramebuffer *nvfb) override;
+
+private:
+	void Bind2DShader() override;
+	void PackDepthbuffer(VirtualFramebuffer *vfb, int x, int y, int w, int h);
+	void SimpleBlit(Draw::Framebuffer *dest, float destX1, float destY1, float destX2, float destY2, Draw::Framebuffer *src, float srcX1, float srcY1, float srcX2, float srcY2, bool linearFilter);
+
+	// Native GX2 context obtained from the DrawContext in the constructor.
+	GX2ContextState *context_;
+
+	// Fetch shader built from g_QuadAttribStream.
+	GX2FetchShader quadFetchShader_ = {};
+	// Static full-screen quad: 4 vertices of 5 floats each.
+	static float fsQuadBuffer_[20];
+	// Size of fsQuadBuffer_ divided by its four vertices, i.e. the per-vertex stride in bytes.
+	const u32 quadStride_ = sizeof(fsQuadBuffer_) / 4;
+	// Dynamic
+	float *quadBuffer_;
+
+	int plainColorLoc_;
+	struct __attribute__((aligned(64))) StencilValueUB {
+		u32_le u_stencilValue[4];
+	};
+	// Freed in the destructor when non-null.
+	StencilValueUB *stencilValueBuffer_ = nullptr;
+	// One pre-built stencil mask register per 8-bit value, filled in the constructor.
+	GX2StencilMaskReg stencilMaskStates_[256]{};
+
+	TextureCacheGX2 *textureCacheGX2_;
+	ShaderManagerGX2 *shaderManagerGX2_;
+	DrawEngineGX2 *drawEngineGX2_;
+
+	// Attribute layout of the quad vertices (position + texture coordinate).
+	static const GX2AttribStream g_QuadAttribStream[2];
+};
diff --git a/GPU/GX2/GPU_GX2.cpp b/GPU/GX2/GPU_GX2.cpp
new file mode 100644
index 000000000000..35619162b958
--- /dev/null
+++ b/GPU/GX2/GPU_GX2.cpp
@@ -0,0 +1,351 @@
+// Copyright (c) 2017- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#include "GPU/GX2/GPU_GX2.h"
+
+#include // NOTE(review): the <...> header name was stripped when this diff was extracted; restore it from the upstream commit.
+
+#include "Common/Serialize/Serializer.h"
+#include "Common/GraphicsContext.h"
+#include "base/NativeApp.h"
+#include "profiler/profiler.h"
+#include "i18n/i18n.h"
+#include "Core/Debugger/Breakpoints.h"
+#include "Core/MemMapHelpers.h"
+#include "Core/MIPS/MIPS.h"
+#include "Core/Host.h"
+#include "Core/Config.h"
+#include "Core/Reporting.h"
+#include "Core/System.h"
+
+#include "GPU/GPUState.h"
+#include "GPU/ge_constants.h"
+#include "GPU/GeDisasm.h"
+
+#include "GPU/Common/FramebufferManagerCommon.h"
+#include "GPU/Debugger/Debugger.h"
+#include "GPU/GX2/ShaderManagerGX2.h"
+#include "GPU/GX2/GPU_GX2.h"
+#include "GPU/GX2/FramebufferManagerGX2.h"
+#include "GPU/GX2/DrawEngineGX2.h"
+#include "GPU/GX2/TextureCacheGX2.h"
+#include "GPU/GX2/GX2Util.h"
+
+#include "Core/HLE/sceKernelThread.h"
+#include "Core/HLE/sceKernelInterrupt.h"
+#include "Core/HLE/sceGe.h"
+
+// Creates the GX2 backend: allocates the shader manager, framebuffer manager, texture
+// cache and depal shader cache, wires them to each other and to the embedded draw
+// engine, then computes the feature flags and reporting strings.
+GPU_GX2::GPU_GX2(GraphicsContext *gfxCtx, Draw::DrawContext *draw)
+	: GPUCommon(gfxCtx, draw), drawEngine_(draw,
+	(GX2ContextState *)draw->GetNativeObject(Draw::NativeObject::CONTEXT)) {
+	context_ = (GX2ContextState *)draw->GetNativeObject(Draw::NativeObject::CONTEXT);
+	lastVsync_ = g_Config.bVSync ? 1 : 0;
+
+	StockGX2::Init();
+
+	// Create the components and publish them through the base-class pointers as well.
+	shaderManagerGX2_ = new ShaderManagerGX2(draw, context_);
+	framebufferManagerGX2_ = new FramebufferManagerGX2(draw);
+	framebufferManager_ = framebufferManagerGX2_;
+	textureCacheGX2_ = new TextureCacheGX2(draw);
+	textureCache_ = textureCacheGX2_;
+	drawEngineCommon_ = &drawEngine_;
+	shaderManager_ = shaderManagerGX2_;
+	depalShaderCache_ = new DepalShaderCacheGX2(draw);
+	// Cross-link the components.
+	drawEngine_.SetShaderManager(shaderManagerGX2_);
+	drawEngine_.SetTextureCache(textureCacheGX2_);
+	drawEngine_.SetFramebufferManager(framebufferManagerGX2_);
+	framebufferManagerGX2_->SetTextureCache(textureCacheGX2_);
+	framebufferManagerGX2_->SetShaderManager(shaderManagerGX2_);
+	framebufferManagerGX2_->SetDrawEngine(&drawEngine_);
+	framebufferManagerGX2_->Init();
+	textureCacheGX2_->SetFramebufferManager(framebufferManagerGX2_);
+	textureCacheGX2_->SetDepalShaderCache(depalShaderCache_);
+	textureCacheGX2_->SetShaderManager(shaderManagerGX2_);
+
+	// Sanity check gstate
+	// transferstart is expected to sit 0xEA ints after the start of gstate.
+	if ((int *)&gstate.transferstart - (int *)&gstate != 0xEA) {
+		ERROR_LOG(G3D, "gstate has drifted out of sync!");
+	}
+
+	// No need to flush before the tex scale/offset commands if we are baking
+	// the tex scale/offset into the vertices anyway.
+	UpdateCmdInfo();
+	CheckGPUFeatures();
+
+	BuildReportingInfo();
+
+	// Some of our defaults are different from hw defaults, let's assert them.
+	// We restore each frame anyway, but here is convenient for tests.
+	textureCache_->NotifyConfigChanged();
+}
+
+// Destroys the components created in the constructor, after releasing the FBOs and
+// shaders they hold, and finally unbinds the current pipeline.
+GPU_GX2::~GPU_GX2() {
+	delete depalShaderCache_;
+	framebufferManagerGX2_->DestroyAllFBOs();
+	delete framebufferManagerGX2_;
+	shaderManagerGX2_->ClearShaders();
+	delete shaderManagerGX2_;
+	delete textureCacheGX2_;
+	draw_->BindPipeline(nullptr);
+}
+
+// Computes gstate_c.featureFlags: a fixed set of capabilities plus depth-handling
+// flags chosen from the configuration and the per-game compatibility flags.
+void GPU_GX2::CheckGPUFeatures() {
+	u32 features = 0;
+
+	features |= GPU_SUPPORTS_BLEND_MINMAX;
+	features |= GPU_PREFER_CPU_DOWNLOAD;
+
+	// Accurate depth is required on AMD/nVidia (for reverse Z) so we ignore the compat flag to disable it on those. See #9545
+	if (!PSP_CoreParameter().compat.flags().DisableAccurateDepth) {
+		features |= GPU_SUPPORTS_ACCURATE_DEPTH; // Breaks text in PaRappa for some reason.
+	}
+
+	features |= GPU_SUPPORTS_ANISOTROPY;
+	features |= GPU_SUPPORTS_OES_TEXTURE_NPOT;
+	features |= GPU_SUPPORTS_DUALSOURCE_BLEND;
+	features |= GPU_SUPPORTS_ANY_COPY_IMAGE;
+	features |= GPU_SUPPORTS_TEXTURE_FLOAT;
+	features |= GPU_SUPPORTS_INSTANCE_RENDERING;
+	features |= GPU_SUPPORTS_TEXTURE_LOD_CONTROL;
+	features |= GPU_SUPPORTS_FBO;
+	features |= GPU_SUPPORTS_16BIT_FORMATS;
+	features |= GPU_SUPPORTS_LOGIC_OP;
+
+	// At most one depth-precision strategy is selected, in this priority order.
+	if (!g_Config.bHighQualityDepth && (features & GPU_SUPPORTS_ACCURATE_DEPTH) != 0) {
+		features |= GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT;
+	} else if (PSP_CoreParameter().compat.flags().PixelDepthRounding) {
+		// Use fragment rounding on desktop and GLES3, most accurate.
+		features |= GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT;
+	} else if (PSP_CoreParameter().compat.flags().VertexDepthRounding) {
+		features |= GPU_ROUND_DEPTH_TO_16BIT;
+	}
+
+	// The Phantasy Star hack :(
+	if (PSP_CoreParameter().compat.flags().DepthRangeHack && (features & GPU_SUPPORTS_ACCURATE_DEPTH) == 0) {
+		features |= GPU_USE_DEPTH_RANGE_HACK;
+	}
+
+	if (PSP_CoreParameter().compat.flags().ClearToRAM) {
+		features |= GPU_USE_CLEAR_RAM_HACK;
+	}
+
+	gstate_c.featureFlags = features;
+}
+
+// Needs to be called on GPU thread, not reporting thread.
+// Builds the strings used for reporting: the vendor string alone, and the vendor
+// string combined with the driver version and shading-language version.
+void GPU_GX2::BuildReportingInfo() {
+	using namespace Draw;
+	DrawContext *thin3d = gfxCtx_->GetDrawContext();
+
+	reportingPrimaryInfo_ = thin3d->GetInfoString(InfoField::VENDORSTRING);
+	reportingFullInfo_ = reportingPrimaryInfo_ + " - " + System_GetProperty(SYSPROP_GPUDRIVER_VERSION) + " - " + thin3d->GetInfoString(InfoField::SHADELANGVERSION);
+}
+
+// Drops shaders, input layouts, textures and framebuffers after a device loss.
+void GPU_GX2::DeviceLost() {
+	// Simply drop all caches and textures.
+	// FBOs appear to survive? Or no?
+	shaderManagerGX2_->ClearShaders();
+	drawEngine_.ClearInputLayoutMap();
+	textureCacheGX2_->Clear(false);
+	framebufferManagerGX2_->DeviceLost();
+}
+
+void GPU_GX2::DeviceRestore() {
+	// Nothing needed.
+}
+
+// Currently a no-op: the clear for non-buffered rendering is still commented out.
+void GPU_GX2::InitClear() {
+	if (!framebufferManager_->UseBufferedRendering()) {
+		// device_->Clear(0, NULL, D3DCLEAR_STENCIL | D3DCLEAR_TARGET | D3DCLEAR_ZBUFFER, D3DCOLOR_XRGB(0, 0, 0), 1.f, 0);
+	}
+}
+
+// Per-host-frame setup; after a resize the feature flags are recomputed and the
+// framebuffer manager, draw engine, texture cache and shader manager are notified.
+void GPU_GX2::BeginHostFrame() {
+	GPUCommon::BeginHostFrame();
+	UpdateCmdInfo();
+	if (resized_) {
+		CheckGPUFeatures();
+		framebufferManager_->Resized();
+		drawEngine_.Resized();
+		textureCacheGX2_->NotifyConfigChanged();
+		shaderManagerGX2_->DirtyLastShader();
+		resized_ = false;
+	}
+}
+
+void GPU_GX2::ReapplyGfxState() {
+	GPUCommon::ReapplyGfxState();
+
+	// TODO: Dirty our caches for depth states etc
+}
+
+void GPU_GX2::EndHostFrame() {
+	// Tell the DrawContext that it's time to reset everything.
+	draw_->BindPipeline(nullptr);
+}
+
+// Per-emulated-frame setup: starts a frame on the caches and draw engine, forgets the
+// last bound shader and marks the through-mode projection matrix dirty.
+void GPU_GX2::BeginFrame() {
+	GPUCommon::BeginFrame();
+
+	textureCacheGX2_->StartFrame();
+	drawEngine_.BeginFrame();
+	depalShaderCache_->Decimate();
+	// fragmentTestCache_.Decimate();
+
+	shaderManagerGX2_->DirtyLastShader();
+
+	framebufferManagerGX2_->BeginFrame();
+	gstate_c.Dirty(DIRTY_PROJTHROUGHMATRIX);
+}
+
+// Forwards the new display framebuffer to the debugger and the framebuffer manager.
+void GPU_GX2::SetDisplayFramebuffer(u32 framebuf, u32 stride, GEBufferFormat format) {
+	// TODO: Some games like Spongebob - Yellow Avenger, never change framebuffer, they blit to it.
+	// So breaking on frames doesn't work. Might want to move this to sceDisplay vsync.
+	GPUDebug::NotifyDisplay(framebuf, stride, format);
+	framebufferManagerGX2_->SetDisplayFramebuffer(framebuf, stride, format);
+}
+
+// Flushes pending draws and presents the display framebuffer with blending disabled
+// and all colour channels writable.
+void GPU_GX2::CopyDisplayToOutput(bool reallyDirty) {
+	GX2SetColorControlReg(&StockGX2::blendDisabledColorWrite);
+	GX2SetTargetChannelMasksReg(&StockGX2::TargetChannelMasks[0xF]);
+
+	drawEngine_.Flush();
+
+	framebufferManagerGX2_->CopyDisplayToOutput(reallyDirty);
+	framebufferManagerGX2_->EndFrame();
+
+	// shaderManager_->EndFrame();
+	shaderManagerGX2_->DirtyLastShader();
+
+	gstate_c.Dirty(DIRTY_TEXTURE_IMAGE);
+}
+
+void GPU_GX2::FinishDeferred() {
+	// This finishes reading any vertex data that is pending.
+	drawEngine_.FinishDeferred();
+}
+
+// Flushes the draw engine when command `cmd` changed value (`diff` != 0) and is
+// flagged FLAG_FLUSHBEFOREONCHANGE.
+inline void GPU_GX2::CheckFlushOp(int cmd, u32 diff) {
+	const u8 cmdFlags = cmdInfo_[cmd].flags;
+	if (diff && (cmdFlags & FLAG_FLUSHBEFOREONCHANGE)) {
+		if (dumpThisFrame_) {
+			NOTICE_LOG(G3D, "================ FLUSH ================");
+		}
+		drawEngine_.Flush();
+	}
+}
+
+// The command number is the top byte of `op`.
+void GPU_GX2::PreExecuteOp(u32 op, u32 diff) {
+	CheckFlushOp(op >> 24, diff);
+}
+
+// Runs the handler registered for the command in the top byte of `op` when its flags
+// request execution; otherwise, if the value changed, applies the dirty bits stored
+// above the low 8 bits of the command's flags.
+void GPU_GX2::ExecuteOp(u32 op, u32 diff) {
+	const u8 cmd = op >> 24;
+	const CommandInfo info = cmdInfo_[cmd];
+	const u8 cmdFlags = info.flags;
+	if ((cmdFlags & FLAG_EXECUTE) || (diff && (cmdFlags & FLAG_EXECUTEONCHANGE))) {
+		(this->*info.func)(op, diff);
+	} else if (diff) {
+		uint64_t dirty = info.flags >> 8;
+		if (dirty)
+			gstate_c.Dirty(dirty);
+	}
+}
+
+// Writes a NUL-terminated, human-readable statistics summary into `buffer`, which
+// holds `bufsize` bytes. Output that does not fit is truncated.
+void GPU_GX2::GetStats(char *buffer, size_t bufsize) {
+	// Nothing can be written into an empty buffer; this also avoids wrapping the size.
+	if (!buffer || bufsize == 0) {
+		return;
+	}
+	float vertexAverageCycles = gpuStats.numVertsSubmitted > 0 ? (float)gpuStats.vertexGPUCycles / (float)gpuStats.numVertsSubmitted : 0.0f;
+	// snprintf already reserves room for the terminator within the size it is given.
+	snprintf(buffer, bufsize,
+		"DL processing time: %0.2f ms\n"
+		"Draw calls: %i, flushes %i, clears %i\n"
+		"Cached Draw calls: %i\n"
+		"Num Tracked Vertex Arrays: %i\n"
+		"GPU cycles executed: %d (%f per vertex)\n"
+		"Commands per call level: %i %i %i %i\n"
+		"Vertices submitted: %i\n"
+		"Cached, Uncached Vertices Drawn: %i, %i\n"
+		"FBOs active: %i\n"
+		"Textures active: %i, decoded: %i invalidated: %i\n"
+		"Readbacks: %d, uploads: %d\n"
+		"Vertex, Fragment shaders loaded: %i, %i\n",
+		gpuStats.msProcessingDisplayLists * 1000.0f,
+		gpuStats.numDrawCalls,
+		gpuStats.numFlushes,
+		gpuStats.numClears,
+		gpuStats.numCachedDrawCalls,
+		gpuStats.numTrackedVertexArrays,
+		gpuStats.vertexGPUCycles + gpuStats.otherGPUCycles,
+		vertexAverageCycles,
+		gpuStats.gpuCommandsAtCallLevel[0], gpuStats.gpuCommandsAtCallLevel[1], gpuStats.gpuCommandsAtCallLevel[2], gpuStats.gpuCommandsAtCallLevel[3],
+		gpuStats.numVertsSubmitted,
+		gpuStats.numCachedVertsDrawn,
+		gpuStats.numUncachedVertsDrawn,
+		(int)framebufferManagerGX2_->NumVFBs(),
+		(int)textureCacheGX2_->NumLoadedTextures(),
+		gpuStats.numTexturesDecoded,
+		gpuStats.numTextureInvalidations,
+		gpuStats.numReadbacks,
+		gpuStats.numUploads,
+		shaderManagerGX2_->GetNumVertexShaders(),
+		shaderManagerGX2_->GetNumFragmentShaders()
+	);
+}
+
+void GPU_GX2::ClearCacheNextFrame() {
+	textureCacheGX2_->ClearNextFrame();
+}
+
+// Drops all compiled shaders together with the input layouts built for them.
+void GPU_GX2::ClearShaderCache() {
+	shaderManagerGX2_->ClearShaders();
+	drawEngine_.ClearInputLayoutMap();
+}
+
+// Save-state hook. When a state is being read (and the core is not frozen) the
+// texture, depal and vertex caches and all FBOs are dropped.
+void GPU_GX2::DoState(PointerWrap &p) {
+	GPUCommon::DoState(p);
+
+	// TODO: Some of these things may not be necessary.
+	// None of these are necessary when saving.
+	if (p.mode == p.MODE_READ && !PSP_CoreParameter().frozen) {
+		textureCache_->Clear(true);
+		depalShaderCache_->Clear();
+		drawEngine_.ClearTrackedVertexArrays();
+
+		gstate_c.Dirty(DIRTY_TEXTURE_IMAGE);
+		framebufferManager_->DestroyAllFBOs();
+	}
+}
+
+// Returns the shader IDs of the requested kind, taken from the draw engine for vertex
+// loaders, the depal cache for depal shaders, and the shader manager otherwise.
+std::vector<std::string> GPU_GX2::DebugGetShaderIDs(DebugShaderType type) {
+	switch (type) {
+	case SHADER_TYPE_VERTEXLOADER:
+		return drawEngine_.DebugGetVertexLoaderIDs();
+	case SHADER_TYPE_DEPAL:
+		return depalShaderCache_->DebugGetShaderIDs(type);
+	default:
+		return shaderManagerGX2_->DebugGetShaderIDs(type);
+	}
+}
+
+// Returns the debug string for shader `id`, routed to the same owner as in
+// DebugGetShaderIDs().
+std::string GPU_GX2::DebugGetShaderString(std::string id, DebugShaderType type, DebugShaderStringType stringType) {
+	switch (type) {
+	case SHADER_TYPE_VERTEXLOADER:
+		return drawEngine_.DebugGetVertexLoaderString(id, stringType);
+	case SHADER_TYPE_DEPAL:
+		return depalShaderCache_->DebugGetShaderString(id, type, stringType);
+	default:
+		return shaderManagerGX2_->DebugGetShaderString(id, type, stringType);
+	}
+}
diff --git a/GPU/GX2/GPU_GX2.h b/GPU/GX2/GPU_GX2.h
new file mode 100644
index 000000000000..0b7f8fc34219
--- /dev/null
+++ b/GPU/GX2/GPU_GX2.h
@@ -0,0 +1,86 @@
+// Copyright (c) 2017- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#pragma once
+
+// <string> and <vector> are required by the std::string / std::vector used in
+// the class below.
+#include <string>
+#include <vector>
+// NOTE(review): the target of the next include was lost when this patch was
+// extracted. Presumably it is the GX2 header that declares GX2ContextState -
+// restore it from the upstream file.
+#include
+
+#include "GPU/GPUCommon.h"
+#include "GPU/GX2/DrawEngineGX2.h"
+#include "GPU/GX2/TextureCacheGX2.h"
+#include "GPU/GX2/DepalettizeShaderGX2.h"
+#include "GPU/Common/VertexDecoderCommon.h"
+
+class FramebufferManagerGX2;
+class ShaderManagerGX2;
+class LinkedShaderGX2;
+
+// GPUCommon implementation for the GX2 graphics API. It owns the GX2 draw
+// engine by value and holds pointers to the GX2 framebuffer manager, texture
+// cache, depal shader cache and shader manager.
+class GPU_GX2 : public GPUCommon {
+public:
+	GPU_GX2(GraphicsContext *gfxCtx, Draw::DrawContext *draw);
+	~GPU_GX2();
+
+	void CheckGPUFeatures() override;
+	void PreExecuteOp(u32 op, u32 diff) override;
+	void ExecuteOp(u32 op, u32 diff) override;
+
+	void ReapplyGfxState() override;
+	void SetDisplayFramebuffer(u32 framebuf, u32 stride, GEBufferFormat format) override;
+	// `bufsize` is the capacity of `buffer` in bytes.
+	void GetStats(char *buffer, size_t bufsize) override;
+	void ClearCacheNextFrame() override;
+	void DeviceLost() override; // Only happens on Android. Drop all textures and shaders.
+	void DeviceRestore() override;
+
+	void DoState(PointerWrap &p) override;
+
+	void ClearShaderCache() override;
+
+	// Using string because it's generic - makes no assumptions on the size of the shader IDs of this backend.
+	std::vector<std::string> DebugGetShaderIDs(DebugShaderType shader) override;
+	std::string DebugGetShaderString(std::string id, DebugShaderType shader, DebugShaderStringType stringType) override;
+
+	void BeginHostFrame() override;
+	void EndHostFrame() override;
+
+protected:
+	void FinishDeferred() override;
+
+private:
+	// Submits whatever the draw engine has queued.
+	void Flush() {
+		drawEngine_.Flush();
+	}
+	// void ApplyDrawState(int prim);
+	void CheckFlushOp(int cmd, u32 diff);
+	void BuildReportingInfo();
+
+	void InitClear() override;
+	void BeginFrame() override;
+	void CopyDisplayToOutput(bool reallyDirty) override;
+
+	GX2ContextState *context_;
+
+	FramebufferManagerGX2 *framebufferManagerGX2_;
+	TextureCacheGX2 *textureCacheGX2_;
+	DepalShaderCacheGX2 *depalShaderCache_;
+	DrawEngineGX2 drawEngine_;
+	ShaderManagerGX2 *shaderManagerGX2_;
+
+	// Default-initialised like vertexCost_ so it never holds an indeterminate value.
+	int lastVsync_ = 0;
+	int vertexCost_ = 0;
+};
diff --git a/GPU/GX2/GX2Shaders.c b/GPU/GX2/GX2Shaders.c
new file mode 100644
index 000000000000..ba6fc173bf2a
--- /dev/null
+++ b/GPU/GX2/GX2Shaders.c
@@ -0,0 +1,5902 @@
+
+#include "GPU/GX2/GX2Shaders.h"
+// NOTE(review): the targets of the next two includes were lost when this patch
+// was extracted - restore them from the upstream file.
+#include
+#include
+
+// Control-flow program for the vertex shader referenced by defVShaderGX2.
+// It has three entries: a fetch-shader call and two exports (POS0 from R1,
+// PARAM0 from R2). The remaining slots of the 32-entry array are
+// zero-initialised.
+// clang-format off
+__attribute__((aligned(GX2_SHADER_ALIGNMENT)))
+static u64 depalVCode [32] =
+{
+	CALL_FS NO_BARRIER,
+	EXP_DONE(POS0, _R1, _x, _y, _z, _1),
+	EXP_DONE(PARAM0, _R2, _x, _y, _0, _0) NO_BARRIER
+	END_OF_PROGRAM
+};
+// clang-format on
+
+GX2VertexShader defVShaderGX2 = {
+	{
+		.sq_pgm_resources_vs.num_gprs = 3,
+		.sq_pgm_resources_vs.stack_size = 1,
+		.spi_vs_out_config.vs_export_count = 1,
+		.num_spi_vs_out_id = 1,
+		{
+			{ .semantic_0 = 0x00, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF },
+			{ .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF },
+			{ .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF },
+			{ .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF },
+			{ .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF },
+			{ .semantic_0 = 0xFF, .semantic_1 = 0xFF,
.semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + }, + .sq_vtx_semantic_clear = ~0x3, + .num_sq_vtx_semantic = 2, + { 0, 1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, + .vgt_vertex_reuse_block_cntl.vtx_reuse_depth = 0xE, + .vgt_hos_reuse_depth.reuse_depth = 0x10, + }, /* regs */ + sizeof(depalVCode), + (uint8_t *)&depalVCode, + GX2_SHADER_MODE_UNIFORM_BLOCK, + .gx2rBuffer.flags = GX2R_RESOURCE_LOCKED_READ_ONLY, +}; + +// clang-format off +__attribute__((aligned(GX2_SHADER_ALIGNMENT))) static struct { + u64 cf[32]; + u64 tex[1 * 2]; +} quadPCode = { + { + TEX(32, 1) VALID_PIX, + EXP_DONE(PIX0, _R0, _r, _g, _b, _a) + END_OF_PROGRAM + }, + { + TEX_SAMPLE(_R0, _r, _g, _b, _a, _R0, _x, _y, _0, _0, _t0, _s0) + } +}; +// clang-format on + +GX2PixelShader defPShaderGX2 = { + { + .sq_pgm_resources_ps.num_gprs = 1, + .sq_pgm_exports_ps.export_mode = 0x2, + .spi_ps_in_control_0.num_interp = 1, + .spi_ps_in_control_0.persp_gradient_ena = 1, + .spi_ps_in_control_0.baryc_sample_cntl = spi_baryc_cntl_centers_only, + .num_spi_ps_input_cntl = 1, + { { .semantic = 0, .default_val = 1 } }, + .cb_shader_mask.output0_enable = 0xF, + .cb_shader_control.rt0_enable = TRUE, + .db_shader_control.z_order = db_z_order_early_z_then_late_z, + }, /* regs */ + sizeof(quadPCode), + (uint8_t *)&quadPCode, + GX2_SHADER_MODE_UNIFORM_BLOCK, + .gx2rBuffer.flags = GX2R_RESOURCE_LOCKED_READ_ONLY, +}; + +__attribute__((aligned(GX2_SHADER_ALIGNMENT))) +static struct +{ + u64 cf[32]; +} stencilVCode = +{ + { + CALL_FS NO_BARRIER, + 
EXP_DONE(POS0, _R1,_x,_y,_z,_w), + EXP_DONE(PARAM0, _R2,_x,_y,_0,_0) NO_BARRIER + END_OF_PROGRAM + }, +}; + +GX2VertexShader stencilUploadVSshaderGX2 = { + { + .sq_pgm_resources_vs.num_gprs = 3, + .sq_pgm_resources_vs.stack_size = 1, + .spi_vs_out_config.vs_export_count = 0, + .num_spi_vs_out_id = 1, + { + { .semantic_0 = 0x00, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + }, + .sq_vtx_semantic_clear = ~0x3, + .num_sq_vtx_semantic = 2, + { 0, 1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, + .vgt_vertex_reuse_block_cntl.vtx_reuse_depth = 0xE, + .vgt_hos_reuse_depth.reuse_depth = 0x10, + }, /* regs */ + sizeof(stencilVCode), + (uint8_t *)&stencilVCode, + GX2_SHADER_MODE_UNIFORM_BLOCK, + .gx2rBuffer.flags = GX2R_RESOURCE_LOCKED_READ_ONLY, +}; +// clang-format off +__attribute__((aligned(GX2_SHADER_ALIGNMENT))) +static struct +{ + u64 cf[32]; + u64 alu[2]; /* 32 */ + u64 alu1[9]; /* 34 */ + u64 alu2[5]; /* 43 */ + u64 tex3[1 * 2]; /* 48 */ +} stencilPCode = +{ + { + ALU_PUSH_BEFORE(32,2) KCACHE0(CB1, _0_15), + JUMP(0,4) VALID_PIX, + TEX(48,1) VALID_PIX, + 
ALU(34,9) KCACHE0(CB1, _0_15), + ELSE(1, 6) VALID_PIX, + ALU_POP_AFTER(43,1), + EXP_DONE(PIX0, _R0,_x,_x,_x,_x) + END_OF_PROGRAM + }, + { + /* 0 */ + ALU_SETE_INT(_R0,_w, KC0(0),_x, ALU_SRC_0,_x) + ALU_LAST, + /* 1 */ + ALU_PRED_SETE_INT(__,_x, _R0,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 2 */ + ALU_MOV(_R0,_x, _R0,_w), + ALU_MUL(__,_y, _R0,_w, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x437FFD71), + /* 3 */ + ALU_FLOOR(__,_x, ALU_SRC_PV,_y) + ALU_LAST, + /* 4 */ + ALU_FLT_TO_INT(__,_x, ALU_SRC_PV,_x) SCL_210 + ALU_LAST, + /* 5 */ + ALU_AND_INT(__,_z, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x000000FF), + /* 6 */ + ALU_AND_INT(__,_y, KC0(0),_x, ALU_SRC_PV,_z) + ALU_LAST, + /* 7 */ + ALU_KILLE_INT(__,_x, ALU_SRC_PV,_y, ALU_SRC_0,_x) + ALU_LAST, + }, + { + /* 8 */ + ALU_MOV(_R0,_x, ALU_SRC_0,_x) + ALU_LAST, + }, + { + TEX_SAMPLE(_R0,_m,_m,_m,_w, _R0,_x,_y,_0,_x, _t0, _s0), + }, +}; +// clang-format on + +GX2PixelShader stencilUploadPSshaderGX2 = { + { + .sq_pgm_resources_ps.num_gprs = 1, + .sq_pgm_resources_ps.stack_size = 1, + .sq_pgm_exports_ps.export_mode = 0x2, + .spi_ps_in_control_0.num_interp = 1, + .spi_ps_in_control_0.persp_gradient_ena = 1, + .spi_ps_in_control_0.baryc_sample_cntl = spi_baryc_cntl_centers_only, + .num_spi_ps_input_cntl = 1, + { { .semantic = 0, .default_val = 1 } }, + .cb_shader_mask.output0_enable = 0xF, + .cb_shader_control.rt0_enable = TRUE, + .db_shader_control.z_order = db_z_order_early_z_then_late_z, + .db_shader_control.kill_enable = TRUE, + }, /* regs */ + sizeof(stencilPCode), + (uint8_t *)&stencilPCode, + GX2_SHADER_MODE_UNIFORM_BLOCK, + .gx2rBuffer.flags = GX2R_RESOURCE_LOCKED_READ_ONLY, +}; + +// clang-format off +__attribute__((aligned(GX2_SHADER_ALIGNMENT))) +static struct { + u64 cf[32]; + u64 alu[16]; +} vsSTCode = { + { + CALL_FS NO_BARRIER, + ALU(32, 16) KCACHE0(CB1, _0_15), + EXP_DONE(POS0, _R1, _x, _y, _z, _w), + EXP(PARAM0, _R2, _x, _y, _0, _0) 
NO_BARRIER, + EXP_DONE(PARAM1, _R3, _r, _g, _b, _a) NO_BARRIER + END_OF_PROGRAM + }, + { + ALU_MUL(__, _x, _R1, _w, KC0(3), _y), + ALU_MUL(__, _y, _R1, _w, KC0(3), _x), + ALU_MUL(__, _z, _R1, _w, KC0(3), _w), + ALU_MUL(__, _w, _R1, _w, KC0(3), _z) + ALU_LAST, + ALU_MULADD(_R123, _x, _R1, _z, KC0(2), _y, ALU_SRC_PV, _x), + ALU_MULADD(_R123, _y, _R1, _z, KC0(2), _x, ALU_SRC_PV, _y), + ALU_MULADD(_R123, _z, _R1, _z, KC0(2), _w, ALU_SRC_PV, _z), + ALU_MULADD(_R123, _w, _R1, _z, KC0(2), _z, ALU_SRC_PV, _w) + ALU_LAST, + ALU_MULADD(_R123, _x, _R1, _y, KC0(1), _y, ALU_SRC_PV, _x), + ALU_MULADD(_R123, _y, _R1, _y, KC0(1), _x, ALU_SRC_PV, _y), + ALU_MULADD(_R123, _z, _R1, _y, KC0(1), _w, ALU_SRC_PV, _z), + ALU_MULADD(_R123, _w, _R1, _y, KC0(1), _z, ALU_SRC_PV, _w) + ALU_LAST, + ALU_MULADD(_R1, _x, _R1, _x, KC0(0), _x, ALU_SRC_PV, _y), + ALU_MULADD(_R1, _y, _R1, _x, KC0(0), _y, ALU_SRC_PV, _x), + ALU_MULADD(_R1, _z, _R1, _x, KC0(0), _z, ALU_SRC_PV, _w), + ALU_MULADD(_R1, _w, _R1, _x, KC0(0), _w, ALU_SRC_PV, _z) + ALU_LAST, + } +}; + +__attribute__((aligned(GX2_SHADER_ALIGNMENT))) static struct { + u64 cf[32]; + u64 alu[16]; + u64 tex[1 * 2]; +} fsSTCode = +{ + { + TEX(48, 1) VALID_PIX, + ALU(32, 4), + EXP_DONE(PIX0, _R0, _r, _g, _b, _a) + END_OF_PROGRAM + }, + { + ALU_MUL(_R0, _r, _R0, _r, _R1, _r), + ALU_MUL(_R0, _g, _R0, _g, _R1, _g), + ALU_MUL(_R0, _b, _R0, _b, _R1, _b), + ALU_MUL(_R0, _a, _R0, _a, _R1, _a) + ALU_LAST + }, + { + TEX_SAMPLE(_R0, _r, _g, _b, _a, _R0, _x, _y, _0, _0, _t0, _s0) + } +}; +// clang-format on + +GX2VertexShader STVshaderGX2 = { + { + .sq_pgm_resources_vs.num_gprs = 4, + .sq_pgm_resources_vs.stack_size = 1, + .spi_vs_out_config.vs_export_count = 1, + .num_spi_vs_out_id = 1, + { + { .semantic_0 = 0x00, .semantic_1 = 0x01, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 
= 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + }, + .sq_vtx_semantic_clear = ~0x7, + .num_sq_vtx_semantic = 3, + { + 0, 1, 2, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + }, + .vgt_vertex_reuse_block_cntl.vtx_reuse_depth = 0xE, + .vgt_hos_reuse_depth.reuse_depth = 0x10, + }, /* regs */ + .size = sizeof(vsSTCode), + .program = (u8 *)&vsSTCode, + .mode = GX2_SHADER_MODE_UNIFORM_BLOCK, + .gx2rBuffer.flags = GX2R_RESOURCE_LOCKED_READ_ONLY, +}; + +GX2PixelShader STPshaderGX2 = { + { + .sq_pgm_resources_ps.num_gprs = 2, + .sq_pgm_exports_ps.export_mode = 0x2, + .spi_ps_in_control_0.num_interp = 2, + .spi_ps_in_control_0.persp_gradient_ena = 1, + .spi_ps_in_control_0.baryc_sample_cntl = spi_baryc_cntl_centers_only, + .num_spi_ps_input_cntl = 2, + { { .semantic = 0, .default_val = 1 }, { .semantic = 1, .default_val = 1 } }, + .cb_shader_mask.output0_enable = 0xF, + .cb_shader_control.rt0_enable = TRUE, + .db_shader_control.z_order = db_z_order_early_z_then_late_z, + }, /* regs */ + .size = sizeof(fsSTCode), + .program = (uint8_t *)&fsSTCode, + .mode = GX2_SHADER_MODE_UNIFORM_BLOCK, + .gx2rBuffer.flags = GX2R_RESOURCE_LOCKED_READ_ONLY, +}; + +__attribute__((aligned(GX2_SHADER_ALIGNMENT))) +static struct +{ + u64 cf[32]; + u64 alu[4]; /* 32 */ + u64 alu1[2]; /* 36 */ + u64 alu2[2]; /* 38 */ + u64 alu3[5]; /* 40 */ + 
u64 alu4[1]; /* 45 */ + u64 alu5[3]; /* 46 */ + u64 alu6[4]; /* 49 */ + u64 alu7[1]; /* 53 */ + u64 alu8[16]; /* 54 */ + u64 alu9[16]; /* 70 */ + u64 alu10[1]; /* 86 */ + u64 alu11[8]; /* 87 */ + u64 alu12[2]; /* 95 */ +} VShaderSWCode = +{ + { + CALL_FS NO_BARRIER, + ALU_PUSH_BEFORE(32,4) KCACHE0(CB4, _0_15), + JUMP(1, 7), + ALU_PUSH_BEFORE(36,2) KCACHE0(CB4, _0_15), + JUMP(1, 6), + ALU_POP_AFTER(38,2) KCACHE0(CB4, _0_15), + ALU_POP_AFTER(40,5), + ALU_PUSH_BEFORE(45,1) KCACHE0(CB4, _0_15), + JUMP(0,10), + ALU(46,3) KCACHE0(CB4, _0_15), + ELSE(1, 12), + ALU_POP_AFTER(49,4) KCACHE0(CB1, _16_31), + ALU_PUSH_BEFORE(53,1) KCACHE0(CB4, _0_15), + JUMP(0,15), + ALU(54,16) KCACHE0(CB1, _0_15), + ELSE(1, 17), + ALU_POP_AFTER(70,16) KCACHE0(CB1, _0_15), + ALU_PUSH_BEFORE(86,1) KCACHE0(CB4, _0_15), + JUMP(1, 20), + ALU_POP_AFTER(87,8) KCACHE0(CB1, _16_31), + ALU(95,2), + EXP_DONE(POS0, _R2,_x,_y,_z,_w), + EXP(PARAM0, _R1,_x,_y,_z,_w) NO_BARRIER, + EXP(PARAM1, _R5,_x,_y,_z,_w) NO_BARRIER, + EXP(PARAM2, _R0,_x,_y,_z,_w) NO_BARRIER, + EXP_DONE(PARAM3, _R3,_x,_y,_y,_y) NO_BARRIER + END_OF_PROGRAM + }, + { + /* 0 */ + ALU_MOV(_R5,_x, ALU_SRC_0,_x), + ALU_MOV(_R5,_y, ALU_SRC_0,_x), + ALU_MOV(_R5,_z, ALU_SRC_0,_x) + ALU_LAST, + /* 1 */ + ALU_PRED_SETNE_INT(__,_x, KC0(5),_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 2 */ + ALU_MOV(_R0,_w, ALU_SRC_0,_x) + ALU_LAST, + /* 3 */ + ALU_PRED_SETNE_INT(__,_x, KC0(5),_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 4 */ + ALU_NOT_INT(__,_x, KC0(4),_y) + ALU_LAST, + /* 5 */ + ALU_CNDE_INT(_R0,_w, ALU_SRC_PV,_x, ALU_SRC_0,_x, ALU_SRC_M_1_INT,_x) + ALU_LAST, + }, + { + /* 6 */ + ALU_CNDE_INT(_R0,_x, _R0,_w, _R4,_x, _R4,_x), + ALU_CNDE_INT(_R0,_y, _R0,_w, _R4,_y, _R4,_y), + ALU_MOV(_R0,_z, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x3F800000), + /* 7 */ + ALU_CNDE_INT(_R0,_z, _R0,_w, ALU_SRC_PV,_z, _R4,_z) + ALU_LAST, + }, + { + /* 8 */ + ALU_PRED_SETNE_INT(__,_x, 
KC0(4),_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 9 */ + ALU_CNDE_INT(_R5,_x, KC0(4),_x, ALU_SRC_0,_x, _R2,_x), + ALU_CNDE_INT(_R5,_y, KC0(4),_x, ALU_SRC_0,_x, _R2,_y), + ALU_CNDE_INT(_R5,_z, KC0(4),_x, ALU_SRC_0,_x, _R2,_z) + ALU_LAST, + }, + { + /* 10 */ + ALU_MOV(_R1,_x, KC0(4),_x), + ALU_MOV(_R1,_y, KC0(4),_y), + ALU_MOV(_R1,_z, KC0(4),_z), + ALU_MOV(_R1,_w, KC0(4),_w) + ALU_LAST, + }, + { + /* 11 */ + ALU_PRED_SETNE_INT(__,_x, KC0(4),_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 12 */ + ALU_MUL(__,_x, KC0(7),_w, ALU_SRC_1,_x), + ALU_MUL(__,_y, KC0(7),_z, ALU_SRC_1,_x), + ALU_MUL(__,_z, KC0(7),_y, ALU_SRC_1,_x), + ALU_MUL(__,_w, KC0(7),_x, ALU_SRC_1,_x) + ALU_LAST, + /* 13 */ + ALU_MULADD(_R123,_x, _R3,_z, KC0(6),_w, ALU_SRC_PV,_x), + ALU_MULADD(_R123,_y, _R3,_z, KC0(6),_z, ALU_SRC_PV,_y), + ALU_MULADD(_R123,_z, _R3,_z, KC0(6),_y, ALU_SRC_PV,_z), + ALU_MULADD(_R123,_w, _R3,_z, KC0(6),_x, ALU_SRC_PV,_w) + ALU_LAST, + /* 14 */ + ALU_MULADD(_R123,_x, _R3,_y, KC0(5),_w, ALU_SRC_PV,_x), + ALU_MULADD(_R123,_y, _R3,_y, KC0(5),_z, ALU_SRC_PV,_y), + ALU_MULADD(_R123,_z, _R3,_y, KC0(5),_y, ALU_SRC_PV,_z), + ALU_MULADD(_R123,_w, _R3,_y, KC0(5),_x, ALU_SRC_PV,_w) + ALU_LAST, + /* 15 */ + ALU_MULADD(_R2,_x, _R3,_x, KC0(4),_x, ALU_SRC_PV,_w), + ALU_MULADD(_R2,_y, _R3,_x, KC0(4),_y, ALU_SRC_PV,_z), + ALU_MULADD(_R2,_z, _R3,_x, KC0(4),_z, ALU_SRC_PV,_y), + ALU_MULADD(_R2,_w, _R3,_x, KC0(4),_w, ALU_SRC_PV,_x) + ALU_LAST, + }, + { + /* 16 */ + ALU_MUL(__,_x, KC0(3),_w, ALU_SRC_1,_x), + ALU_MUL(__,_y, KC0(3),_z, ALU_SRC_1,_x), + ALU_MUL(__,_z, KC0(3),_y, ALU_SRC_1,_x), + ALU_MUL(__,_w, KC0(3),_x, ALU_SRC_1,_x) + ALU_LAST, + /* 17 */ + ALU_MULADD(_R123,_x, _R3,_z, KC0(2),_w, ALU_SRC_PV,_x), + ALU_MULADD(_R123,_y, _R3,_z, KC0(2),_z, ALU_SRC_PV,_y), + ALU_MULADD(_R123,_z, _R3,_z, KC0(2),_y, ALU_SRC_PV,_z), + ALU_MULADD(_R123,_w, _R3,_z, KC0(2),_x, ALU_SRC_PV,_w) + ALU_LAST, + /* 18 */ + 
ALU_MULADD(_R123,_x, _R3,_y, KC0(1),_w, ALU_SRC_PV,_x), + ALU_MULADD(_R123,_y, _R3,_y, KC0(1),_z, ALU_SRC_PV,_y), + ALU_MULADD(_R123,_z, _R3,_y, KC0(1),_y, ALU_SRC_PV,_z), + ALU_MULADD(_R123,_w, _R3,_y, KC0(1),_x, ALU_SRC_PV,_w) + ALU_LAST, + /* 19 */ + ALU_MULADD(_R2,_x, _R3,_x, KC0(0),_x, ALU_SRC_PV,_w), + ALU_MULADD(_R2,_y, _R3,_x, KC0(0),_y, ALU_SRC_PV,_z), + ALU_MULADD(_R2,_z, _R3,_x, KC0(0),_z, ALU_SRC_PV,_y), + ALU_MULADD(_R2,_w, _R3,_x, KC0(0),_w, ALU_SRC_PV,_x) + ALU_LAST, + }, + { + /* 20 */ + ALU_PRED_SETNE_INT(__,_x, KC0(10),_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 21 */ + ALU_RECIP_IEEE(__,_x, _R2,_w) SCL_210 + ALU_LAST, + /* 22 */ + ALU_MUL_IEEE(__,_y, _R2,_z, ALU_SRC_PS,_x) + ALU_LAST, + /* 23 */ + ALU_MUL(__,_x, ALU_SRC_PV,_y, KC0(2),_x) + ALU_LAST, + /* 24 */ + ALU_ADD(__,_z, KC0(2),_y, ALU_SRC_PV,_x) + ALU_LAST, + /* 25 */ + ALU_FLOOR(__,_w, ALU_SRC_PV,_z) + ALU_LAST, + /* 26 */ + ALU_ADD(__,_y, KC0(2) _NEG,_z, ALU_SRC_PV,_w) + ALU_LAST, + /* 27 */ + ALU_MUL(__,_x, KC0(2),_w, ALU_SRC_PV,_y) + ALU_LAST, + /* 28 */ + ALU_MUL(_R2,_z, _R2,_w, ALU_SRC_PV,_x) + ALU_LAST, + }, + { + /* 29 */ + ALU_NOP(__,_x), + ALU_MOV(_R3,_x, _R3,_w) + ALU_LAST, + }, +}; + +GX2VertexShader VShaderSWGX2 = { + { + .sq_pgm_resources_vs.num_gprs = 6, + .sq_pgm_resources_vs.stack_size = 1, + .spi_vs_out_config.vs_export_count = 3, + .num_spi_vs_out_id = 1, + { + { .semantic_0 = 0x00, .semantic_1 = 0x01, .semantic_2 = 0x03, .semantic_3 = 0x02 }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, 
.semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + }, + .sq_vtx_semantic_clear = ~0xF, + .num_sq_vtx_semantic = 4, + { + 2, 3, 0, 1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + }, + .vgt_vertex_reuse_block_cntl.vtx_reuse_depth = 0xE, + .vgt_hos_reuse_depth.reuse_depth = 0x10, + }, /* regs */ + .size = sizeof(VShaderSWCode), + .program = (u8 *)&VShaderSWCode, + .mode = GX2_SHADER_MODE_UNIFORM_BLOCK, + .gx2rBuffer.flags = GX2R_RESOURCE_LOCKED_READ_ONLY, +}; + + +__attribute__((aligned(GX2_SHADER_ALIGNMENT))) +static struct +{ + u64 cf[320]; + u64 alu[8]; /* 320 */ + u64 alu1[1]; /* 328 */ + u64 alu2[1]; /* 329 */ + u64 alu3[6]; /* 330 */ + u64 alu4[3]; /* 336 */ + u64 alu5[6]; /* 339 */ + u64 alu6[1]; /* 345 */ + u64 alu7[3]; /* 346 */ + u64 alu8[6]; /* 349 */ + u64 alu9[1]; /* 355 */ + u64 alu10[2]; /* 356 */ + u64 alu11[2]; /* 358 */ + u64 alu12[4]; /* 360 */ + u64 alu13[3]; /* 364 */ + u64 alu14[1]; /* 367 */ + u64 alu15[1]; /* 368 */ + u64 alu16[1]; /* 369 */ + u64 alu17[7]; /* 370 */ + u64 alu18[16]; /* 377 */ + u64 alu19[14]; /* 393 */ + u64 alu20[38]; /* 407 */ + u64 alu21[2]; /* 445 */ + u64 alu22[19]; /* 447 */ + u64 alu23[53]; /* 466 */ + u64 alu24[3]; /* 519 */ + u64 alu25[19]; /* 522 */ + u64 alu26[53]; /* 541 */ + u64 alu27[3]; /* 594 */ + u64 alu28[19]; /* 597 */ + u64 alu29[53]; /* 616 */ + u64 alu30[4]; /* 669 */ + u64 alu31[1]; /* 673 */ + u64 alu32[12]; /* 674 */ + u64 alu33[24]; /* 686 */ + u64 alu34[5]; /* 710 */ + u64 alu35[2]; /* 715 */ + u64 alu36[4]; /* 717 */ + u64 alu37[2]; /* 721 */ + u64 alu38[6]; /* 723 */ + u64 alu39[3]; /* 729 */ + u64 alu40[7]; /* 732 */ + u64 alu41[2]; /* 739 */ + u64 
alu42[3]; /* 741 */ + u64 alu43[5]; /* 744 */ + u64 alu44[3]; /* 749 */ + u64 alu45[5]; /* 752 */ + u64 alu46[13]; /* 757 */ + u64 alu47[4]; /* 770 */ + u64 alu48[2]; /* 774 */ + u64 alu49[3]; /* 776 */ + u64 alu50[2]; /* 779 */ + u64 alu51[3]; /* 781 */ + u64 alu52[3]; /* 784 */ + u64 alu53[7]; /* 787 */ + u64 alu54[3]; /* 794 */ + u64 alu55[5]; /* 797 */ + u64 alu56[3]; /* 802 */ + u64 alu57[5]; /* 805 */ + u64 alu58[3]; /* 810 */ + u64 alu59[5]; /* 813 */ + u64 alu60[13]; /* 818 */ + u64 alu61[11]; /* 831 */ + u64 alu62[1]; /* 842 */ + u64 alu63[5]; /* 843 */ + u64 alu64[2]; /* 848 */ + u64 alu65[2]; /* 850 */ + u64 alu66[2]; /* 852 */ + u64 alu67[1]; /* 854 */ + u64 alu68[3]; /* 855 */ + u64 alu69[1]; /* 858 */ + u64 alu70[3]; /* 859 */ + u64 alu71[6]; /* 862 */ + u64 alu72[1]; /* 868 */ + u64 alu73[3]; /* 869 */ + u64 alu74[6]; /* 872 */ + u64 alu75[1]; /* 878 */ + u64 alu76[3]; /* 879 */ + u64 alu77[7]; /* 882 */ + u64 alu78[1]; /* 889 */ + u64 alu79[3]; /* 890 */ + u64 alu80[6]; /* 893 */ + u64 alu81[1]; /* 899 */ + u64 alu82[3]; /* 900 */ + u64 alu83[7]; /* 903 */ + u64 alu84[1]; /* 910 */ + u64 alu85[3]; /* 911 */ + u64 alu86[6]; /* 914 */ + u64 alu87[1]; /* 920 */ + u64 alu88[1]; /* 921 */ + u64 alu89[1]; /* 922 */ + u64 alu90[2]; /* 923 */ + u64 alu91[1]; /* 925 */ + u64 alu92[1]; /* 926 */ + u64 alu93[1]; /* 927 */ + u64 alu94[2]; /* 928 */ + u64 alu95[3]; /* 930 */ + u64 alu96[1]; /* 933 */ + u64 alu97[1]; /* 934 */ + u64 alu98[1]; /* 935 */ + u64 alu99[2]; /* 936 */ + u64 alu100[2]; /* 938 */ + u64 alu101[2]; /* 940 */ + u64 alu102[19]; /* 942 */ + u64 alu103[1]; /* 961 */ + u64 alu104[1]; /* 962 */ + u64 alu105[1]; /* 963 */ + u64 alu106[19]; /* 964 */ + u64 alu107[1]; /* 983 */ + u64 alu108[4]; /* 984 */ + u64 alu109[3]; /* 988 */ + u64 alu110[10]; /* 991 */ + u64 alu111[1]; /* 1001 */ + u64 alu112[13]; /* 1002 */ + u64 alu113[5]; /* 1015 */ + u64 alu114[3]; /* 1020 */ + u64 alu115[3]; /* 1023 */ + u64 alu116[3]; /* 1026 */ + u64 alu117[4]; /* 1029 
*/ + u64 alu118[3]; /* 1033 */ + u64 alu119[4]; /* 1036 */ + u64 alu120[12]; /* 1040 */ + u64 alu121[3]; /* 1052 */ + u64 alu122[3]; /* 1055 */ + u64 alu123[3]; /* 1058 */ + u64 alu124[2]; /* 1061 */ + u64 alu125[2]; /* 1063 */ + u64 alu126[3]; /* 1065 */ + u64 alu127[3]; /* 1068 */ + u64 alu128[3]; /* 1071 */ + u64 alu129[3]; /* 1074 */ + u64 alu130[3]; /* 1077 */ + u64 alu131[3]; /* 1080 */ + u64 alu132[3]; /* 1083 */ + u64 alu133[3]; /* 1086 */ + u64 alu134[3]; /* 1089 */ + u64 alu135[3]; /* 1092 */ + u64 alu136[4]; /* 1095 */ + u64 alu137[3]; /* 1099 */ + u64 alu138[5]; /* 1102 */ + u64 alu139[3]; /* 1107 */ + u64 alu140[4]; /* 1110 */ + u64 alu141[12]; /* 1114 */ + u64 alu142[3]; /* 1126 */ + u64 alu143[4]; /* 1129 */ + u64 alu144[2]; /* 1133 */ + u64 alu145[3]; /* 1135 */ + u64 alu146[3]; /* 1138 */ + u64 alu147[3]; /* 1141 */ + u64 alu148[3]; /* 1144 */ + u64 alu149[3]; /* 1147 */ + u64 alu150[3]; /* 1150 */ + u64 alu151[3]; /* 1153 */ + u64 alu152[3]; /* 1156 */ + u64 alu153[3]; /* 1159 */ + u64 alu154[3]; /* 1162 */ + u64 alu155[4]; /* 1165 */ + u64 alu156[3]; /* 1169 */ + u64 alu157[5]; /* 1172 */ + u64 alu158[3]; /* 1177 */ + u64 alu159[4]; /* 1180 */ + u64 alu160[11]; /* 1184 */ + u64 alu161[3]; /* 1195 */ + u64 alu162[5]; /* 1198 */ + u64 alu163[3]; /* 1203 */ + u64 alu164[2]; /* 1206 */ + u64 alu165[3]; /* 1208 */ + u64 alu166[3]; /* 1211 */ + u64 alu167[3]; /* 1214 */ + u64 alu168[3]; /* 1217 */ + u64 alu169[3]; /* 1220 */ + u64 alu170[20]; /* 1223 */ + u64 alu171[9]; /* 1243 */ + u64 alu172[2]; /* 1252 */ + u64 alu173[2]; /* 1254 */ + u64 alu174[1]; /* 1256 */ + u64 alu175[1]; /* 1257 */ + u64 alu176[3]; /* 1258 */ + u64 alu177[2]; /* 1261 */ + u64 alu178[3]; /* 1263 */ + u64 alu179[2]; /* 1266 */ + u64 alu180[3]; /* 1268 */ + u64 alu181[2]; /* 1271 */ + u64 alu182[10]; /* 1273 */ + u64 alu183[3]; /* 1283 */ + u64 alu184[8]; /* 1286 */ + u64 alu185[2]; /* 1294 */ + u64 alu186[5]; /* 1296 */ + u64 alu187[1]; /* 1301 */ + u64 alu188[1]; /* 1302 */ + 
u64 alu189[4]; /* 1303 */ + u64 alu190[2]; /* 1307 */ + u64 alu191[4]; /* 1309 */ + u64 alu192[3]; /* 1313 */ + u64 alu193[3]; /* 1316 */ + u64 alu194[1]; /* 1319 */ + u64 alu195[1]; /* 1320 */ + u64 alu196[20]; /* 1321 */ + u64 alu197[19]; /* 1341 */ + u64 tex198[1 * 2]; /* 1360 */ + u64 tex199[3 * 2]; /* 1362 */ + u64 tex200[1 * 2]; /* 1368 */ + u64 tex201[3 * 2]; /* 1370 */ + u64 tex202[1 * 2]; /* 1376 */ + u64 tex203[1 * 2]; /* 1378 */ + u64 tex204[3 * 2]; /* 1380 */ + u64 tex205[1 * 2]; /* 1386 */ +} PShaderAllCode = +{ + { + ALU_PUSH_BEFORE(320,8) KCACHE0(CB5, _0_15), + JUMP(1, 289), + ALU_PUSH_BEFORE(328,1) KCACHE0(CB5, _0_15), + JUMP(1, 131), + ALU_PUSH_BEFORE(329,1) KCACHE0(CB5, _0_15), + JUMP(0,19), + ALU_PUSH_BEFORE(330,6) KCACHE0(CB5, _0_15), + JUMP(0,9), + ALU(336,3) KCACHE0(CB1, _16_31), + ELSE(1, 11), + ALU_POP_AFTER(339,6) KCACHE0(CB1, _16_31), + ALU_PUSH_BEFORE(345,1) KCACHE0(CB5, _0_15), + JUMP(0,14), + ALU(346,3) KCACHE0(CB1, _16_31), + ELSE(1, 16), + ALU_POP_AFTER(349,6) KCACHE0(CB1, _16_31), + ALU_PUSH_BEFORE(355,1) KCACHE0(CB5, _0_15), + JUMP(1, 19), + ALU_POP_AFTER(356,2) KCACHE0(CB1, _16_31), + ELSE(1, 21), + ALU_POP_AFTER(358,2), + ALU_PUSH_BEFORE(360,4) KCACHE0(CB5, _0_15), + JUMP(0,29), + ALU(364,3), + TEX(1360,1) VALID_PIX, + ALU_PUSH_BEFORE(367,1) KCACHE0(CB5, _0_15), + JUMP(1, 29) VALID_PIX, + TEX(1362,3) VALID_PIX, + POP(1, 29), + ELSE(1, 35), + TEX(1368,1) VALID_PIX, + ALU_PUSH_BEFORE(368,1) KCACHE0(CB5, _0_15), + JUMP(2, 35) VALID_PIX, + TEX(1370,3) VALID_PIX, + POP(2, 35), + ALU_PUSH_BEFORE(369,1) KCACHE0(CB5, _0_15), + JUMP(1, 71) VALID_PIX, + ALU(370,7) KCACHE0(CB1, _16_31), + TEX(1376,1) VALID_PIX, + ALU_PUSH_BEFORE(377,16) KCACHE0(CB1, _16_31), + JUMP(0,44) VALID_PIX, + ALU_PUSH_BEFORE(393,14), + JUMP(1, 44) VALID_PIX, + ALU_POP_AFTER(407,38), + ELSE(1, 64) VALID_PIX, + ALU_PUSH_BEFORE(445,2), + JUMP(0,50) VALID_PIX, + ALU_PUSH_BEFORE(447,19), + JUMP(1, 50) VALID_PIX, + ALU_POP_AFTER(466,53), + ELSE(1, 63) VALID_PIX, + 
ALU_PUSH_BEFORE(519,3), + JUMP(0,56) VALID_PIX, + ALU_PUSH_BEFORE(522,19), + JUMP(1, 56) VALID_PIX, + ALU_POP_AFTER(541,53), + ELSE(0,62) VALID_PIX, + ALU_PUSH_BEFORE(594,3), + JUMP(4, 64) VALID_PIX, + ALU_PUSH_BEFORE(597,19), + JUMP(5, 64) VALID_PIX, + ALU_POP2_AFTER(616,53), + POP(2, 63), + POP(1, 64), + ALU(669,4), + TEX(1378,1) VALID_PIX, + ALU_PUSH_BEFORE(673,1), + JUMP(2, 71) VALID_PIX, + ALU(674,12), + TEX(1380,3) VALID_PIX, + ALU_POP2_AFTER(686,24), + ALU_PUSH_BEFORE(710,5) KCACHE0(CB5, _0_15), + JUMP(0,101) VALID_PIX, + ALU_PUSH_BEFORE(715,2) KCACHE0(CB5, _0_15), + JUMP(0,76) VALID_PIX, + ALU(717,4), + ELSE(1, 101) VALID_PIX, + ALU_PUSH_BEFORE(721,2) KCACHE0(CB5, _0_15), + JUMP(0,80) VALID_PIX, + ALU(723,6), + ELSE(1, 100) VALID_PIX, + ALU_PUSH_BEFORE(729,3) KCACHE0(CB5, _0_15), + JUMP(0,84) VALID_PIX, + ALU(732,7) KCACHE0(CB1, _16_31), + ELSE(0,99) VALID_PIX, + ALU_PUSH_BEFORE(739,2) KCACHE0(CB5, _0_15), + JUMP(0,97) VALID_PIX, + ALU_PUSH_BEFORE(741,3) KCACHE0(CB5, _0_15), + JUMP(0,90) VALID_PIX, + ALU(744,5), + ELSE(1, 97) VALID_PIX, + ALU_PUSH_BEFORE(749,3) KCACHE0(CB5, _0_15), + JUMP(0,94) VALID_PIX, + ALU(752,5), + ELSE(1, 96) VALID_PIX, + ALU_POP_AFTER(757,13) KCACHE0(CB5, _0_15), + POP(1, 97), + ELSE(1, 99) VALID_PIX, + ALU_POP_AFTER(770,4), + POP(2, 100), + POP(1, 101), + ELSE(1, 130) VALID_PIX, + ALU_PUSH_BEFORE(774,2) KCACHE0(CB5, _0_15), + JUMP(0,105) VALID_PIX, + ALU(776,3), + ELSE(0,129) VALID_PIX, + ALU_PUSH_BEFORE(779,2) KCACHE0(CB5, _0_15), + JUMP(0,109) VALID_PIX, + ALU(781,3), + ELSE(1, 129) VALID_PIX, + ALU_PUSH_BEFORE(784,3) KCACHE0(CB5, _0_15), + JUMP(0,113) VALID_PIX, + ALU(787,7) KCACHE0(CB1, _16_31), + ELSE(0,128) VALID_PIX, + ALU_PUSH_BEFORE(794,3) KCACHE0(CB5, _0_15), + JUMP(0,117) VALID_PIX, + ALU(797,5), + ELSE(1, 128) VALID_PIX, + ALU_PUSH_BEFORE(802,3) KCACHE0(CB5, _0_15), + JUMP(0,121) VALID_PIX, + ALU(805,5), + ELSE(0,127) VALID_PIX, + ALU_PUSH_BEFORE(810,3) KCACHE0(CB5, _0_15), + JUMP(0,125) VALID_PIX, + ALU(813,5), + 
ELSE(1, 127) VALID_PIX, + ALU_POP_AFTER(818,13) KCACHE0(CB5, _0_15), + POP(2, 128), + POP(2, 129), + POP(2, 130), + POP(1, 131), + ALU_PUSH_BEFORE(831,11) KCACHE0(CB5, _0_15), + JUMP(1, 190) VALID_PIX, + ALU_PUSH_BEFORE(842,1) KCACHE0(CB5, _0_15), + JUMP(0,148) VALID_PIX, + ALU_PUSH_BEFORE(843,5) KCACHE0(CB5, _0_15), + JUMP(0,140) VALID_PIX, + ALU(848,2), + ALU_PUSH_BEFORE(850,2), + POP(1, 140), + ELSE(1, 148) VALID_PIX, + ALU(852,2) KCACHE0(CB5, _0_15), + ALU_PUSH_BEFORE(854,1), + ELSE(1, 147) VALID_PIX, + ALU(855,3), + ALU_PUSH_BEFORE(858,1), + POP(2, 147), + POP(1, 148), + ELSE(0,189) VALID_PIX, + ALU_PUSH_BEFORE(859,3) KCACHE0(CB5, _0_15), + ALU(862,6) KCACHE0(CB1, _16_31), + ALU_PUSH_BEFORE(868,1) KCACHE0(CB1, _16_31), + POP(1, 153), + ELSE(1, 189) VALID_PIX, + ALU_PUSH_BEFORE(869,3) KCACHE0(CB5, _0_15), + ALU(872,6) KCACHE0(CB1, _16_31), + ALU_PUSH_BEFORE(878,1) KCACHE0(CB1, _16_31), + POP(1, 158), + ELSE(0,188) VALID_PIX, + ALU_PUSH_BEFORE(879,3) KCACHE0(CB5, _0_15), + ALU(882,7) KCACHE0(CB1, _16_31), + ALU_PUSH_BEFORE(889,1), + POP(1, 163), + ELSE(1, 188) VALID_PIX, + ALU_PUSH_BEFORE(890,3) KCACHE0(CB5, _0_15), + ALU(893,6) KCACHE0(CB1, _16_31), + ALU_PUSH_BEFORE(899,1) KCACHE0(CB1, _16_31), + POP(1, 168), + ELSE(0,187) VALID_PIX, + ALU_PUSH_BEFORE(900,3) KCACHE0(CB5, _0_15), + ALU(903,7) KCACHE0(CB1, _16_31), + ALU_PUSH_BEFORE(910,1), + POP(1, 173), + ELSE(1, 187) VALID_PIX, + ALU_PUSH_BEFORE(911,3) KCACHE0(CB5, _0_15), + ALU(914,6) KCACHE0(CB1, _16_31), + ALU_PUSH_BEFORE(920,1) KCACHE0(CB1, _16_31), + POP(1, 178), + ELSE(0,186) VALID_PIX, + ALU(921,1), + ALU_PUSH_BEFORE(922,1), + ELSE(1, 186) VALID_PIX, + ALU(923,2) KCACHE0(CB5, _0_15), + ALU_PUSH_BEFORE(925,1), + ELSE(0,185) VALID_PIX, + POP(2, 186), + POP(2, 187), + POP(2, 188), + POP(2, 189), + POP(2, 190), + ALU_PUSH_BEFORE(926,1) KCACHE0(CB5, _0_15), + ALU_PUSH_BEFORE(927,1) KCACHE0(CB5, _0_15), + ALU_PUSH_BEFORE(928,2) KCACHE0(CB5, _0_15), + ALU(930,3), + ALU_PUSH_BEFORE(933,1), + POP(1, 196), + 
ELSE(1, 204) VALID_PIX, + ALU(934,1), + ALU_PUSH_BEFORE(935,1), + ALU(936,2), + ALU_PUSH_BEFORE(938,2), + POP(1, 202), + ELSE(0,203) VALID_PIX, + POP(2, 204), + ELSE(0,217) VALID_PIX, + ALU_PUSH_BEFORE(940,2) KCACHE0(CB5, _0_15), + ALU(942,19) KCACHE0(CB1, _16_31), + ALU_PUSH_BEFORE(961,1), + POP(1, 209), + ELSE(1, 217) VALID_PIX, + ALU(962,1), + ALU_PUSH_BEFORE(963,1), + ALU(964,19) KCACHE0(CB1, _16_31), + ALU_PUSH_BEFORE(983,1), + POP(1, 215), + ELSE(0,216) VALID_PIX, + POP(2, 217), + POP(2, 218), + ALU_PUSH_BEFORE(984,4) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(988,3), + ALU_POP_AFTER(991,10) KCACHE0(CB5, _0_15), + ALU_PUSH_BEFORE(1001,1) KCACHE0(CB5, _0_15), + ALU_POP_AFTER(1002,13) KCACHE0(CB1, _16_31), + ALU_PUSH_BEFORE(1015,5) KCACHE0(CB5, _0_15), + ALU_PUSH_BEFORE(1020,3) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1023,3), + ALU_PUSH_BEFORE(1026,3) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1029,4), + ALU_PUSH_BEFORE(1033,3) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1036,4), + ALU_POP2_AFTER(1040,12) KCACHE0(CB5, _0_15) KCACHE1(CB1, _16_31), + POP(1, 232), + ALU_POP_AFTER(1052,3), + ALU_PUSH_BEFORE(1055,3) KCACHE0(CB5, _0_15), + ALU(1058,3), + TEX(1386,1) VALID_PIX, + ALU_PUSH_BEFORE(1061,2) KCACHE0(CB5, _0_15), + ALU_PUSH_BEFORE(1063,2) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1065,3), + ALU_PUSH_BEFORE(1068,3) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1071,3), + ALU_PUSH_BEFORE(1074,3) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1077,3), + ALU_PUSH_BEFORE(1080,3) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1083,3), + ALU_PUSH_BEFORE(1086,3) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1089,3), + ALU_PUSH_BEFORE(1092,3) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1095,4), + ALU_PUSH_BEFORE(1099,3) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1102,5), + ALU_PUSH_BEFORE(1107,3) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1110,4), + ALU_POP2_AFTER(1114,12) KCACHE0(CB5, _0_15) KCACHE1(CB1, _16_31), + POP(6, 255), + ELSE(1, 257) VALID_PIX, + ALU_POP_AFTER(1126,3), + ALU_PUSH_BEFORE(1129,4) KCACHE0(CB5, _0_15), + 
ALU_PUSH_BEFORE(1133,2) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1135,3), + ALU_PUSH_BEFORE(1138,3) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1141,3), + ALU_PUSH_BEFORE(1144,3) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1147,3), + ALU_PUSH_BEFORE(1150,3) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1153,3), + ALU_PUSH_BEFORE(1156,3) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1159,3), + ALU_PUSH_BEFORE(1162,3) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1165,4), + ALU_PUSH_BEFORE(1169,3) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1172,5), + ALU_PUSH_BEFORE(1177,3) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1180,4), + ALU_POP2_AFTER(1184,11) KCACHE0(CB5, _0_15) KCACHE1(CB1, _16_31), + POP(6, 276), + ELSE(1, 278) VALID_PIX, + ALU_POP_AFTER(1195,3), + ALU_PUSH_BEFORE(1198,5) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1203,3), + ALU_PUSH_BEFORE(1206,2) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1208,3), + ALU_PUSH_BEFORE(1211,3) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1214,3), + ALU_PUSH_BEFORE(1217,3) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1220,3), + ALU_POP2_AFTER(1223,20) KCACHE0(CB5, _0_15), + POP(3, 288), + ALU_POP_AFTER(1243,9) KCACHE0(CB5, _0_15), + ALU_PUSH_BEFORE(1252,2) KCACHE0(CB5, _0_15), + ALU_PUSH_BEFORE(1254,2) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1256,1) KCACHE0(CB1, _16_31), + ALU_PUSH_BEFORE(1257,1) KCACHE0(CB5, _0_15), + ALU_PUSH_BEFORE(1258,3) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1261,2), + ALU_PUSH_BEFORE(1263,3) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1266,2), + ALU_PUSH_BEFORE(1268,3) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1271,2), + ALU_POP2_AFTER(1273,10) KCACHE0(CB5, _0_15), + POP(4, 301), + ALU_PUSH_BEFORE(1283,3) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1286,8), + ALU_PUSH_BEFORE(1294,2) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1296,5), + ALU(1301,1) KCACHE0(CB5, _0_15), + ALU_PUSH_BEFORE(1302,1) KCACHE0(CB5, _0_15), + ELSE(1, 309) VALID_PIX, + ALU_POP_AFTER(1303,4), + POP(2, 310), + ALU_PUSH_BEFORE(1307,2) KCACHE0(CB5, _0_15), + ALU_ELSE_AFTER(1309,4), + ALU_PUSH_BEFORE(1313,3) KCACHE0(CB5, _0_15), + 
ALU_ELSE_AFTER(1316,3), + ALU(1319,1) KCACHE0(CB5, _0_15), + POP(2, 316), + ALU_PUSH_BEFORE(1320,1) KCACHE0(CB5, _0_15), + ALU_POP_AFTER(1321,20) KCACHE0(CB5, _0_15), + ALU(1341,14), + EXP_DONE(PIX0, _R0,_x,_y,_z,_w) BURSTCNT(2) + END_OF_PROGRAM + }, + { + /* 0 */ + ALU_SETE_INT(_R3,_z, KC0(3),_z, ALU_SRC_LITERAL,_x), + ALU_SETNE_INT(_R0,_w, ALU_SRC_0,_x, KC0(3),_z), + ALU_MOV(_R2,_w, ALU_SRC_0,_x) + ALU_LAST, + ALU_LITERAL(0x00000003), + /* 1 */ + ALU_SETNE_INT(_R3,_y, KC0(2),_w, ALU_SRC_1_INT,_x), + ALU_SETNE_INT(_R3,_w, KC0(2),_w, ALU_SRC_0,_x) + ALU_LAST, + /* 2 */ + ALU_NOT_INT(_R4,_w, KC0(0),_x) + ALU_LAST, + /* 3 */ + ALU_PRED_SETNE_INT(__,_x, _R4,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 4 */ + ALU_PRED_SETNE_INT(__,_x, KC0(0),_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 5 */ + ALU_PRED_SETNE_INT(__,_x, KC0(1),_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 6 */ + ALU_RECIP_IEEE(__,_x, _R4,_z) SCL_210 + ALU_LAST, + /* 7 */ + ALU_MUL_IEEE(__,_y, _R4,_y, ALU_SRC_PS,_x), + ALU_MUL_IEEE(__,_z, _R4,_x, ALU_SRC_PS,_x) + ALU_LAST, + /* 8 */ + ALU_CNDE_INT(_R4,_x, KC0(4),_y, _R4,_y, ALU_SRC_PV,_y), + ALU_CNDE_INT(_R4,_w, KC0(4),_y, _R4,_x, ALU_SRC_PV,_z) + ALU_LAST, + /* 9 */ + ALU_PRED_SETNE_INT(__,_x, KC0(1),_z, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 10 */ + ALU_MAX(__,_y, _R4,_w, KC0(12),_z), + ALU_ADD(__,_z, KC0(12),_x, KC0(12) _NEG,_z) + ALU_LAST, + /* 11 */ + ALU_MIN(_R6,_x, ALU_SRC_PV,_z, ALU_SRC_PV,_y) + ALU_LAST, + }, + { + /* 12 */ + ALU_ADD(__,_y, _R4,_w, KC0(12) _NEG,_x), + ALU_RECIP_IEEE(__,_y, KC0(12),_x) SCL_210 + ALU_LAST, + /* 13 */ + ALU_MUL_IEEE(__,_w, ALU_SRC_PV,_y, ALU_SRC_PS,_x) + ALU_LAST, + /* 14 */ + ALU_ADD(__,_z, ALU_SRC_PV,_w, ALU_SRC_1,_x) + ALU_LAST, + /* 15 */ + ALU_FLOOR(__,_y, ALU_SRC_PV,_z) + ALU_LAST, + /* 16 */ + ALU_MULADD(_R6,_x, ALU_SRC_PV _NEG,_y, KC0(12),_x, _R4,_w) 
+ ALU_LAST, + }, + { + /* 17 */ + ALU_PRED_SETNE_INT(__,_x, KC0(1),_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 18 */ + ALU_MAX(__,_x, _R4,_x, KC0(12),_w), + ALU_ADD(__,_z, KC0(12),_y, KC0(12) _NEG,_w) + ALU_LAST, + /* 19 */ + ALU_MIN(_R6,_y, ALU_SRC_PV,_z, ALU_SRC_PV,_x) + ALU_LAST, + }, + { + /* 20 */ + ALU_ADD(__,_x, _R4,_x, KC0(12) _NEG,_y), + ALU_RECIP_IEEE(__,_x, KC0(12),_y) SCL_210 + ALU_LAST, + /* 21 */ + ALU_MUL_IEEE(__,_w, ALU_SRC_PV,_x, ALU_SRC_PS,_x) + ALU_LAST, + /* 22 */ + ALU_ADD(__,_z, ALU_SRC_PV,_w, ALU_SRC_1,_x) + ALU_LAST, + /* 23 */ + ALU_FLOOR(__,_x, ALU_SRC_PV,_z) + ALU_LAST, + /* 24 */ + ALU_MULADD(_R6,_y, ALU_SRC_PV _NEG,_x, KC0(12),_y, _R4,_x) + ALU_LAST, + }, + { + /* 25 */ + ALU_PRED_SETNE_INT(__,_x, KC0(2),_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 26 */ + ALU_ADD(_R6,_x, _R6,_x, KC0(13),_x), + ALU_ADD(_R6,_y, _R6,_y, KC0(13),_y) + ALU_LAST, + }, + { + /* 27 */ + ALU_MOV(_R6,_x, _R4,_x), + ALU_MOV(_R6,_y, _R4,_y) + ALU_LAST, + }, + { + /* 28 */ + ALU_NOT_INT(__,_y, KC0(1),_y) + ALU_LAST, + /* 29 */ + ALU_CNDE_INT(_R123,_x, ALU_SRC_PV,_y, ALU_SRC_0,_x, ALU_SRC_M_1_INT,_x) + ALU_LAST, + /* 30 */ + ALU_CNDE_INT(_R4,_w, KC0(4),_y, ALU_SRC_0,_x, ALU_SRC_PV,_x) + ALU_LAST, + /* 31 */ + ALU_PRED_SETNE_INT(__,_x, _R4,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 32 */ + ALU_RECIP_IEEE(__,_x, _R4,_z) SCL_210 + ALU_LAST, + /* 33 */ + ALU_MUL(_R4,_x, _R6,_x, ALU_SRC_PS,_x), + ALU_MUL(_R4,_y, _R6,_y, ALU_SRC_PS,_x) + ALU_LAST, + }, + { + /* 34 */ + ALU_PRED_SETNE_INT(__,_x, KC0(1),_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 35 */ + ALU_PRED_SETNE_INT(__,_x, KC0(1),_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 36 */ + ALU_PRED_SETNE_INT(__,_x, KC0(1),_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 37 */ + 
ALU_LSHR_INT(_R4,_x, KC0(5),_y, ALU_SRC_LITERAL,_x), + ALU_LSHR_INT(_R4,_y, KC0(5),_y, ALU_SRC_LITERAL,_y), + ALU_MOV(_R4,_z, ALU_SRC_0,_x), + ALU_LSHR_INT(_R4,_w, KC0(5),_y, ALU_SRC_LITERAL,_z), + ALU_LSHR_INT(_R5,_z, KC0(5),_y, ALU_SRC_LITERAL,_w) + ALU_LAST, + ALU_LITERAL4(0x00000010, 0x00000008, 0x00000018, 0x0000001F), + }, + { + /* 38 */ + ALU_AND_INT(_R11,_x, KC0(5),_y, ALU_SRC_LITERAL,_x), + ALU_AND_INT(__,_y, _R4,_x, ALU_SRC_LITERAL,_x), + ALU_AND_INT(_R4,_z, _R4,_y, ALU_SRC_LITERAL,_x), + ALU_AND_INT(_R5,_w, _R4,_w, ALU_SRC_LITERAL,_y), + ALU_SETNE_INT(_R12,_x, ALU_SRC_0,_x, _R5,_z) + ALU_LAST, + ALU_LITERAL2(0x000000FF, 0x00000003), + /* 39 */ + ALU_SETNE_INT(_R5,_z, ALU_SRC_PV,_w, ALU_SRC_0,_x), + ALU_LSHL_INT(_R4,_w, ALU_SRC_PV,_y, ALU_SRC_LITERAL,_x), + ALU_INT_TO_FLT(__,_z, _R5,_x) SCL_210 + ALU_LAST, + ALU_LITERAL(0x00000004), + /* 40 */ + ALU_MUL(__,_y, _R6,_x, ALU_SRC_PS,_x), + ALU_INT_TO_FLT(__,_y, _R5,_y) SCL_210 + ALU_LAST, + /* 41 */ + ALU_MUL(__,_x, _R6,_y, ALU_SRC_PS,_x), + ALU_FRACT(_R10,_x, ALU_SRC_PV,_y) + ALU_LAST, + /* 42 */ + ALU_FRACT(_R10,_y, ALU_SRC_PV,_x) + ALU_LAST, + /* 43 */ + ALU_PRED_SETE_INT(__,_x, _R5,_z, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 44 */ + ALU_MUL(__,_x, _R16,_y, ALU_SRC_LITERAL,_x), + ALU_MUL(_R127,_y, _R16,_x, ALU_SRC_LITERAL,_y), + ALU_MUL(_R127,_w, _R16,_z, ALU_SRC_LITERAL,_y) + ALU_LAST, + ALU_LITERAL2(0x427FF5C3, 0x41FFEB85), + /* 45 */ + ALU_FLT_TO_UINT(__,_x, ALU_SRC_PV,_x) SCL_210 + ALU_LAST, + /* 46 */ + ALU_LSHL_INT(_R127,_w, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_FLT_TO_UINT(__,_w, _R127,_w) SCL_210 + ALU_LAST, + ALU_LITERAL(0x00000005), + /* 47 */ + ALU_LSHL_INT(__,_z, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_FLT_TO_UINT(_R127,_z, _R127,_y) SCL_210 + ALU_LAST, + ALU_LITERAL(0x0000000B), + /* 48 */ + ALU_OR_INT(__,_y, _R127,_w, ALU_SRC_PV,_z) + ALU_LAST, + /* 49 */ + ALU_OR_INT(_R13,_x, _R127,_z, ALU_SRC_PV,_y) + ALU_LAST, + /* 50 */ + 
ALU_PRED_SETNE_INT(__,_x, _R12,_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 51 */ + ALU_MUL(_R127,_x, _R8,_z, ALU_SRC_LITERAL,_x), + ALU_MUL(_R127,_y, _R8,_y, ALU_SRC_LITERAL,_y), + ALU_MUL(_R127,_z, _R9,_z, ALU_SRC_LITERAL,_x) VEC_120, + ALU_MUL(__,_w, _R9,_y, ALU_SRC_LITERAL,_y) VEC_120, + ALU_MUL(_R126,_x, _R7,_y, ALU_SRC_LITERAL,_y) + ALU_LAST, + ALU_LITERAL2(0x41FFEB85, 0x427FF5C3), + /* 52 */ + ALU_MUL(_R125,_x, _R9,_x, ALU_SRC_LITERAL,_x), + ALU_MUL(_R126,_y, _R7,_x, ALU_SRC_LITERAL,_x) VEC_120, + ALU_MUL(_R126,_z, _R8,_x, ALU_SRC_LITERAL,_x) VEC_201, + ALU_MUL(_R127,_w, _R7,_z, ALU_SRC_LITERAL,_x), + ALU_FLT_TO_UINT(__,_x, ALU_SRC_PV,_w) SCL_210 + ALU_LAST, + ALU_LITERAL(0x41FFEB85), + /* 53 */ + ALU_LSHL_INT(_R127,_z, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_FLT_TO_UINT(__,_z, _R127,_z) SCL_210 + ALU_LAST, + ALU_LITERAL(0x00000005), + /* 54 */ + ALU_LSHL_INT(__,_y, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_FLT_TO_UINT(__,_y, _R127,_y) SCL_210 + ALU_LAST, + ALU_LITERAL(0x0000000B), + /* 55 */ + ALU_LSHL_INT(_R127,_x, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_OR_INT(_R126,_w, _R127,_z, ALU_SRC_PV,_y), + ALU_FLT_TO_UINT(__,_x, _R127,_x) SCL_210 + ALU_LAST, + ALU_LITERAL(0x00000005), + /* 56 */ + ALU_LSHL_INT(__,_w, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_FLT_TO_UINT(__,_w, _R126,_x) SCL_210 + ALU_LAST, + ALU_LITERAL(0x0000000B), + /* 57 */ + ALU_OR_INT(_R127,_z, _R127,_x, ALU_SRC_PV,_w), + ALU_LSHL_INT(_R127,_w, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_FLT_TO_UINT(__,_z, _R127,_w) SCL_210 + ALU_LAST, + ALU_LITERAL(0x00000005), + /* 58 */ + ALU_LSHL_INT(__,_z, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_FLT_TO_UINT(__,_z, _R125,_x) SCL_210 + ALU_LAST, + ALU_LITERAL(0x0000000B), + /* 59 */ + ALU_OR_INT(_R18,_x, ALU_SRC_PS,_x, _R126,_w), + ALU_OR_INT(_R127,_y, _R127,_w, ALU_SRC_PV,_z), + ALU_FLT_TO_UINT(__,_x, _R126,_z) SCL_210 + ALU_LAST, + /* 60 */ + ALU_OR_INT(_R15,_x, ALU_SRC_PS,_x, _R127,_z), + 
ALU_FLT_TO_UINT(__,_x, _R126,_y) SCL_210 + ALU_LAST, + /* 61 */ + ALU_OR_INT(_R14,_x, ALU_SRC_PS,_x, _R127,_y) + ALU_LAST, + }, + { + /* 62 */ + ALU_SETNE_INT(_R4,_y, _R5,_w, ALU_SRC_1_INT,_x) + ALU_LAST, + /* 63 */ + ALU_PRED_SETE_INT(__,_x, _R4,_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 64 */ + ALU_MUL(_R126,_x, _R16,_x, ALU_SRC_LITERAL,_x), + ALU_MUL(_R127,_y, _R16,_w, ALU_SRC_1,_x), + ALU_MUL(__,_z, _R16,_z, ALU_SRC_LITERAL,_x), + ALU_MUL(_R127,_w, _R16,_y, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x41FFEB85), + /* 65 */ + ALU_FLT_TO_UINT(__,_x, ALU_SRC_PV,_z) SCL_210 + ALU_LAST, + /* 66 */ + ALU_LSHL_INT(_R127,_x, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_FLT_TO_UINT(__,_x, _R127,_y) SCL_210 + ALU_LAST, + ALU_LITERAL(0x0000000A), + /* 67 */ + ALU_LSHL_INT(__,_w, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_FLT_TO_UINT(__,_w, _R127,_w) SCL_210 + ALU_LAST, + ALU_LITERAL(0x0000000F), + /* 68 */ + ALU_LSHL_INT(__,_y, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_OR_INT(__,_z, _R127,_x, ALU_SRC_PV,_w), + ALU_FLT_TO_UINT(_R127,_y, _R126,_x) SCL_210 + ALU_LAST, + ALU_LITERAL(0x00000005), + /* 69 */ + ALU_OR_INT(__,_y, ALU_SRC_PV,_y, ALU_SRC_PV,_z) + ALU_LAST, + /* 70 */ + ALU_OR_INT(_R13,_x, _R127,_y, ALU_SRC_PV,_y) + ALU_LAST, + /* 71 */ + ALU_PRED_SETNE_INT(__,_x, _R12,_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 72 */ + ALU_MUL(_R127,_x, _R9,_w, ALU_SRC_1,_x), + ALU_MUL(__,_y, _R9,_z, ALU_SRC_LITERAL,_x), + ALU_MUL(_R127,_z, _R8,_w, ALU_SRC_1,_x) VEC_120, + ALU_MUL(_R127,_w, _R8,_z, ALU_SRC_LITERAL,_x) VEC_120, + ALU_MUL(_R127,_y, _R7,_w, ALU_SRC_1,_x) + ALU_LAST, + ALU_LITERAL(0x41FFEB85), + /* 73 */ + ALU_MUL(_R126,_x, _R9,_y, ALU_SRC_LITERAL,_x), + ALU_MUL(_R125,_y, _R8,_y, ALU_SRC_LITERAL,_x) VEC_120, + ALU_MUL(_R126,_z, _R7,_z, ALU_SRC_LITERAL,_x), + ALU_MUL(_R125,_w, _R7,_y, ALU_SRC_LITERAL,_x) VEC_201, + ALU_FLT_TO_UINT(__,_x, ALU_SRC_PV,_y) SCL_210 + ALU_LAST, + 
ALU_LITERAL(0x41FFEB85), + /* 74 */ + ALU_MUL(_R124,_x, _R9,_x, ALU_SRC_LITERAL,_x), + ALU_MUL(_R124,_y, _R8,_x, ALU_SRC_LITERAL,_x) VEC_120, + ALU_LSHL_INT(_R126,_w, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_y), + ALU_FLT_TO_UINT(__,_x, _R127,_x) SCL_210 + ALU_LAST, + ALU_LITERAL2(0x41FFEB85, 0x0000000A), + /* 75 */ + ALU_MUL(_R4,_x, _R7,_x, ALU_SRC_LITERAL,_x), + ALU_LSHL_INT(__,_z, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_y), + ALU_FLT_TO_UINT(__,_x, _R127,_w) SCL_210 + ALU_LAST, + ALU_LITERAL2(0x41FFEB85, 0x0000000F), + /* 76 */ + ALU_OR_INT(_R125,_x, _R126,_w, ALU_SRC_PV,_z), + ALU_LSHL_INT(_R126,_y, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_FLT_TO_UINT(__,_x, _R127,_z) SCL_210 + ALU_LAST, + ALU_LITERAL(0x0000000A), + /* 77 */ + ALU_LSHL_INT(__,_x, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_FLT_TO_UINT(__,_x, _R126,_z) SCL_210 + ALU_LAST, + ALU_LITERAL(0x0000000F), + /* 78 */ + ALU_LSHL_INT(_R127,_x, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_OR_INT(_R126,_w, _R126,_y, ALU_SRC_PV,_x), + ALU_FLT_TO_UINT(__,_x, _R127,_y) SCL_210 + ALU_LAST, + ALU_LITERAL(0x0000000A), + /* 79 */ + ALU_LSHL_INT(__,_w, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_FLT_TO_UINT(__,_w, _R126,_x) SCL_210 + ALU_LAST, + ALU_LITERAL(0x0000000F), + /* 80 */ + ALU_LSHL_INT(__,_x, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_OR_INT(_R126,_z, _R127,_x, ALU_SRC_PV,_w), + ALU_FLT_TO_UINT(__,_x, _R125,_y) SCL_210 + ALU_LAST, + ALU_LITERAL(0x00000005), + /* 81 */ + ALU_LSHL_INT(__,_z, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_OR_INT(_R125,_w, ALU_SRC_PV,_x, _R125,_x), + ALU_FLT_TO_UINT(__,_z, _R125,_w) SCL_210 + ALU_LAST, + ALU_LITERAL(0x00000005), + /* 82 */ + ALU_LSHL_INT(__,_y, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_OR_INT(_R127,_z, ALU_SRC_PV,_z, _R126,_w), + ALU_FLT_TO_UINT(__,_y, _R124,_x) SCL_210 + ALU_LAST, + ALU_LITERAL(0x00000005), + /* 83 */ + ALU_OR_INT(_R18,_x, ALU_SRC_PS,_x, _R125,_w), + ALU_OR_INT(_R124,_y, ALU_SRC_PV,_y, _R126,_z), + ALU_FLT_TO_UINT(__,_x, _R124,_y) SCL_210 + ALU_LAST, + /* 84 */ + 
ALU_OR_INT(_R15,_x, ALU_SRC_PS,_x, _R127,_z), + ALU_FLT_TO_UINT(__,_x, _R4,_x) SCL_210 + ALU_LAST, + /* 85 */ + ALU_OR_INT(_R14,_x, ALU_SRC_PS,_x, _R124,_y) + ALU_LAST, + }, + { + /* 86 */ + ALU_SETNE_INT(_R4,_x, _R5,_w, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000002), + /* 87 */ + ALU_PRED_SETE_INT(__,_x, _R4,_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 88 */ + ALU_MUL(_R127,_x, _R16,_y, ALU_SRC_LITERAL,_x), + ALU_MUL(_R127,_y, _R16,_x, ALU_SRC_LITERAL,_x), + ALU_MUL(_R127,_z, _R16,_w, ALU_SRC_LITERAL,_x), + ALU_MUL(__,_w, _R16,_z, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x417FD70A), + /* 89 */ + ALU_FLT_TO_UINT(__,_x, ALU_SRC_PV,_w) SCL_210 + ALU_LAST, + /* 90 */ + ALU_LSHL_INT(_R126,_x, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_FLT_TO_UINT(__,_x, _R127,_z) SCL_210 + ALU_LAST, + ALU_LITERAL(0x00000008), + /* 91 */ + ALU_LSHL_INT(__,_w, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_FLT_TO_UINT(__,_w, _R127,_x) SCL_210 + ALU_LAST, + ALU_LITERAL(0x0000000C), + /* 92 */ + ALU_LSHL_INT(__,_y, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_OR_INT(__,_z, _R126,_x, ALU_SRC_PV,_w), + ALU_FLT_TO_UINT(_R127,_y, _R127,_y) SCL_210 + ALU_LAST, + ALU_LITERAL(0x00000004), + /* 93 */ + ALU_OR_INT(__,_y, ALU_SRC_PV,_y, ALU_SRC_PV,_z) + ALU_LAST, + /* 94 */ + ALU_OR_INT(_R13,_x, _R127,_y, ALU_SRC_PV,_y) + ALU_LAST, + /* 95 */ + ALU_PRED_SETNE_INT(__,_x, _R12,_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 96 */ + ALU_MUL(_R127,_x, _R8,_z, ALU_SRC_LITERAL,_x), + ALU_MUL(_R127,_y, _R9,_w, ALU_SRC_LITERAL,_x), + ALU_MUL(__,_z, _R9,_z, ALU_SRC_LITERAL,_x) VEC_120, + ALU_MUL(_R127,_w, _R8,_w, ALU_SRC_LITERAL,_x) VEC_120, + ALU_MUL(_R125,_w, _R7,_z, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x417FD70A), + /* 97 */ + ALU_MUL(_R126,_x, _R7,_y, ALU_SRC_LITERAL,_x), + ALU_MUL(_R126,_y, _R8,_y, ALU_SRC_LITERAL,_x) VEC_120, + ALU_MUL(_R127,_z, _R7,_w, ALU_SRC_LITERAL,_x), + ALU_MUL(_R124,_w, _R9,_y, 
ALU_SRC_LITERAL,_x) VEC_201, + ALU_FLT_TO_UINT(__,_x, ALU_SRC_PV,_z) SCL_210 + ALU_LAST, + ALU_LITERAL(0x417FD70A), + /* 98 */ + ALU_MUL(_R124,_x, _R9,_x, ALU_SRC_LITERAL,_x), + ALU_MUL(_R125,_y, _R7,_x, ALU_SRC_LITERAL,_x) VEC_120, + ALU_MUL(_R126,_z, _R8,_x, ALU_SRC_LITERAL,_x) VEC_201, + ALU_LSHL_INT(_R126,_w, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_y), + ALU_FLT_TO_UINT(__,_x, _R127,_y) SCL_210 + ALU_LAST, + ALU_LITERAL2(0x417FD70A, 0x00000008), + /* 99 */ + ALU_LSHL_INT(__,_z, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_FLT_TO_UINT(__,_z, _R127,_x) SCL_210 + ALU_LAST, + ALU_LITERAL(0x0000000C), + /* 100 */ + ALU_OR_INT(_R125,_x, _R126,_w, ALU_SRC_PV,_z), + ALU_LSHL_INT(_R127,_y, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_FLT_TO_UINT(__,_x, _R127,_w) SCL_210 + ALU_LAST, + ALU_LITERAL(0x00000008), + /* 101 */ + ALU_LSHL_INT(__,_x, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_FLT_TO_UINT(__,_x, _R125,_w) SCL_210 + ALU_LAST, + ALU_LITERAL(0x0000000C), + /* 102 */ + ALU_LSHL_INT(_R127,_x, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_OR_INT(_R125,_w, _R127,_y, ALU_SRC_PV,_x), + ALU_FLT_TO_UINT(__,_x, _R127,_z) SCL_210 + ALU_LAST, + ALU_LITERAL(0x00000008), + /* 103 */ + ALU_LSHL_INT(__,_w, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_FLT_TO_UINT(__,_w, _R124,_w) SCL_210 + ALU_LAST, + ALU_LITERAL(0x0000000C), + /* 104 */ + ALU_LSHL_INT(__,_x, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_OR_INT(_R127,_z, _R127,_x, ALU_SRC_PV,_w), + ALU_FLT_TO_UINT(__,_x, _R126,_y) SCL_210 + ALU_LAST, + ALU_LITERAL(0x00000004), + /* 105 */ + ALU_LSHL_INT(__,_z, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_OR_INT(_R124,_w, ALU_SRC_PV,_x, _R125,_x), + ALU_FLT_TO_UINT(__,_z, _R126,_x) SCL_210 + ALU_LAST, + ALU_LITERAL(0x00000004), + /* 106 */ + ALU_LSHL_INT(__,_y, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_OR_INT(_R125,_z, ALU_SRC_PV,_z, _R125,_w), + ALU_FLT_TO_UINT(__,_y, _R124,_x) SCL_210 + ALU_LAST, + ALU_LITERAL(0x00000004), + /* 107 */ + ALU_OR_INT(_R18,_x, ALU_SRC_PS,_x, _R124,_w), + ALU_OR_INT(_R126,_y, 
ALU_SRC_PV,_y, _R127,_z), + ALU_FLT_TO_UINT(__,_x, _R126,_z) SCL_210 + ALU_LAST, + /* 108 */ + ALU_OR_INT(_R15,_x, ALU_SRC_PS,_x, _R125,_z), + ALU_FLT_TO_UINT(__,_x, _R125,_y) SCL_210 + ALU_LAST, + /* 109 */ + ALU_OR_INT(_R14,_x, ALU_SRC_PS,_x, _R126,_y) + ALU_LAST, + }, + { + /* 110 */ + ALU_SETNE_INT(_R5,_w, _R5,_w, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000003), + /* 111 */ + ALU_PRED_SETE_INT(__,_x, _R5,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 112 */ + ALU_MUL(_R127,_x, _R16,_y, ALU_SRC_LITERAL,_x), + ALU_MUL(_R127,_y, _R16,_x, ALU_SRC_LITERAL,_x), + ALU_MUL(_R127,_z, _R16,_w, ALU_SRC_LITERAL,_x), + ALU_MUL(__,_w, _R16,_z, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x437FFD71), + /* 113 */ + ALU_FLT_TO_UINT(__,_x, ALU_SRC_PV,_w) SCL_210 + ALU_LAST, + /* 114 */ + ALU_LSHL_INT(_R126,_x, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_FLT_TO_UINT(__,_x, _R127,_z) SCL_210 + ALU_LAST, + ALU_LITERAL(0x00000010), + /* 115 */ + ALU_LSHL_INT(__,_w, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_FLT_TO_UINT(__,_w, _R127,_x) SCL_210 + ALU_LAST, + ALU_LITERAL(0x00000018), + /* 116 */ + ALU_LSHL_INT(__,_y, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_OR_INT(__,_z, _R126,_x, ALU_SRC_PV,_w), + ALU_FLT_TO_UINT(_R127,_y, _R127,_y) SCL_210 + ALU_LAST, + ALU_LITERAL(0x00000008), + /* 117 */ + ALU_OR_INT(__,_y, ALU_SRC_PV,_y, ALU_SRC_PV,_z) + ALU_LAST, + /* 118 */ + ALU_OR_INT(_R13,_x, _R127,_y, ALU_SRC_PV,_y) + ALU_LAST, + /* 119 */ + ALU_PRED_SETNE_INT(__,_x, _R12,_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 120 */ + ALU_MUL(_R127,_x, _R8,_z, ALU_SRC_LITERAL,_x), + ALU_MUL(_R127,_y, _R9,_w, ALU_SRC_LITERAL,_x), + ALU_MUL(__,_z, _R9,_z, ALU_SRC_LITERAL,_x) VEC_120, + ALU_MUL(_R127,_w, _R8,_w, ALU_SRC_LITERAL,_x) VEC_120, + ALU_MUL(_R125,_w, _R7,_z, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x437FFD71), + /* 121 */ + ALU_MUL(_R126,_x, _R7,_y, ALU_SRC_LITERAL,_x), + ALU_MUL(_R126,_y, _R8,_y, 
ALU_SRC_LITERAL,_x) VEC_120, + ALU_MUL(_R127,_z, _R7,_w, ALU_SRC_LITERAL,_x), + ALU_MUL(_R124,_w, _R9,_y, ALU_SRC_LITERAL,_x) VEC_201, + ALU_FLT_TO_UINT(__,_x, ALU_SRC_PV,_z) SCL_210 + ALU_LAST, + ALU_LITERAL(0x437FFD71), + /* 122 */ + ALU_MUL(_R124,_x, _R9,_x, ALU_SRC_LITERAL,_x), + ALU_MUL(_R125,_y, _R7,_x, ALU_SRC_LITERAL,_x) VEC_120, + ALU_MUL(_R126,_z, _R8,_x, ALU_SRC_LITERAL,_x) VEC_201, + ALU_LSHL_INT(_R126,_w, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_y), + ALU_FLT_TO_UINT(__,_x, _R127,_y) SCL_210 + ALU_LAST, + ALU_LITERAL2(0x437FFD71, 0x00000010), + /* 123 */ + ALU_LSHL_INT(__,_z, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_FLT_TO_UINT(__,_z, _R127,_x) SCL_210 + ALU_LAST, + ALU_LITERAL(0x00000018), + /* 124 */ + ALU_OR_INT(_R125,_x, _R126,_w, ALU_SRC_PV,_z), + ALU_LSHL_INT(_R127,_y, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_FLT_TO_UINT(__,_x, _R127,_w) SCL_210 + ALU_LAST, + ALU_LITERAL(0x00000010), + /* 125 */ + ALU_LSHL_INT(__,_x, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_FLT_TO_UINT(__,_x, _R125,_w) SCL_210 + ALU_LAST, + ALU_LITERAL(0x00000018), + /* 126 */ + ALU_LSHL_INT(_R127,_x, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_OR_INT(_R125,_w, _R127,_y, ALU_SRC_PV,_x), + ALU_FLT_TO_UINT(__,_x, _R127,_z) SCL_210 + ALU_LAST, + ALU_LITERAL(0x00000010), + /* 127 */ + ALU_LSHL_INT(__,_w, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_FLT_TO_UINT(__,_w, _R124,_w) SCL_210 + ALU_LAST, + ALU_LITERAL(0x00000018), + /* 128 */ + ALU_LSHL_INT(__,_x, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_OR_INT(_R127,_z, _R127,_x, ALU_SRC_PV,_w), + ALU_FLT_TO_UINT(__,_x, _R126,_y) SCL_210 + ALU_LAST, + ALU_LITERAL(0x00000008), + /* 129 */ + ALU_LSHL_INT(__,_z, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_OR_INT(_R124,_w, ALU_SRC_PV,_x, _R125,_x), + ALU_FLT_TO_UINT(__,_z, _R126,_x) SCL_210 + ALU_LAST, + ALU_LITERAL(0x00000008), + /* 130 */ + ALU_LSHL_INT(__,_y, ALU_SRC_PS,_x, ALU_SRC_LITERAL,_x), + ALU_OR_INT(_R125,_z, ALU_SRC_PV,_z, _R125,_w), + ALU_FLT_TO_UINT(__,_y, _R124,_x) SCL_210 + ALU_LAST, + 
ALU_LITERAL(0x00000008), + /* 131 */ + ALU_OR_INT(_R18,_x, ALU_SRC_PS,_x, _R124,_w), + ALU_OR_INT(_R126,_y, ALU_SRC_PV,_y, _R127,_z), + ALU_FLT_TO_UINT(__,_x, _R126,_z) SCL_210 + ALU_LAST, + /* 132 */ + ALU_OR_INT(_R15,_x, ALU_SRC_PS,_x, _R125,_z), + ALU_FLT_TO_UINT(__,_x, _R125,_y) SCL_210 + ALU_LAST, + /* 133 */ + ALU_OR_INT(_R14,_x, ALU_SRC_PS,_x, _R126,_y) + ALU_LAST, + }, + { + /* 134 */ + ALU_MOV(_R4,_y, ALU_SRC_0,_x), + ALU_LSHR_INT(__,_w, _R13,_x, _R4,_z) + ALU_LAST, + /* 135 */ + ALU_AND_INT(__,_z, _R11,_x, ALU_SRC_PV,_w) + ALU_LAST, + /* 136 */ + ALU_OR_INT(_R4,_x, _R4,_w, ALU_SRC_PV,_z) + ALU_LAST, + }, + { + /* 137 */ + ALU_PRED_SETNE_INT(__,_x, _R12,_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 138 */ + ALU_LSHR_INT(__,_x, _R14,_x, _R4,_z), + ALU_LSHR_INT(__,_y, _R18,_x, _R4,_z) VEC_120, + ALU_MOV(_R4,_z, ALU_SRC_0,_x), + ALU_LSHR_INT(__,_w, _R15,_x, _R4,_z) VEC_201, + ALU_MOV(_R4,_y, ALU_SRC_0,_x) + ALU_LAST, + /* 139 */ + ALU_AND_INT(__,_x, _R11,_x, ALU_SRC_PV,_y), + ALU_AND_INT(__,_y, _R11,_x, ALU_SRC_PV,_x), + ALU_AND_INT(__,_z, _R11,_x, ALU_SRC_PV,_w), + ALU_MOV(_R5,_w, ALU_SRC_0,_x) + ALU_LAST, + /* 140 */ + ALU_OR_INT(_R4,_x, _R4,_w, ALU_SRC_PV,_x), + ALU_OR_INT(_R5,_y, _R4,_w, ALU_SRC_PV,_y), + ALU_OR_INT(_R4,_w, _R4,_w, ALU_SRC_PV,_z) + ALU_LAST, + }, + { + /* 141 */ + ALU_ADD(_R127,_x, _R16 _NEG,_w, _R6,_w), + ALU_ADD(_R127,_y, _R16 _NEG,_z, _R6,_z), + ALU_ADD(__,_z, _R16 _NEG,_y, _R6,_y), + ALU_ADD(_R127,_w, _R16 _NEG,_x, _R6,_x) + ALU_LAST, + /* 142 */ + ALU_ADD(_R126,_x, _R4 _NEG,_w, _R5,_w), + ALU_ADD(__,_y, _R4 _NEG,_z, _R5,_z), + ALU_ADD(__,_z, _R4 _NEG,_y, _R5,_y) VEC_120, + ALU_ADD(_R126,_w, _R4 _NEG,_x, _R5,_x) VEC_021, + ALU_MULADD(_R126,_z, ALU_SRC_PV,_z, _R10,_x, _R16,_y) + ALU_LAST, + /* 143 */ + ALU_MULADD(_R127,_x, _R127,_x, _R10,_x, _R16,_w), + ALU_MULADD(_R126,_y, _R127,_y, _R10,_x, _R16,_z), + ALU_MULADD(_R123,_z, ALU_SRC_PV,_z, _R10,_x, _R4,_y), + ALU_MULADD(_R127,_w, _R127,_w, 
_R10,_x, _R16,_x), + ALU_MULADD(_R122,_x, ALU_SRC_PV,_y, _R10,_x, _R4,_z) + ALU_LAST, + /* 144 */ + ALU_MULADD(_R123,_x, _R126,_x, _R10,_x, _R4,_w), + ALU_ADD(_R127,_y, ALU_SRC_PV _NEG,_y, ALU_SRC_PS,_x), + ALU_ADD(_R127,_z, _R126 _NEG,_z, ALU_SRC_PV,_z), + ALU_MULADD(_R123,_w, _R126,_w, _R10,_x, _R4,_x) + ALU_LAST, + /* 145 */ + ALU_ADD(__,_x, _R127 _NEG,_x, ALU_SRC_PV,_x), + ALU_ADD(__,_w, _R127 _NEG,_w, ALU_SRC_PV,_w) + ALU_LAST, + /* 146 */ + ALU_MULADD(_R16,_x, ALU_SRC_PV,_w, _R10,_y, _R127,_w), + ALU_MULADD(_R16,_y, _R127,_z, _R10,_y, _R126,_z), + ALU_MULADD(_R16,_z, _R127,_y, _R10,_y, _R126,_y), + ALU_MULADD(_R16,_w, ALU_SRC_PV,_x, _R10,_y, _R127,_x) + ALU_LAST, + }, + { + /* 147 */ + ALU_ADD(_R5,_x, _R1,_x, _R16,_x), + ALU_ADD(_R5,_y, _R1,_y, _R16,_y), + ALU_ADD(_R4,_z, _R1,_z, _R16,_z), + ALU_MUL(_R5,_z, _R1,_w, _R16,_w) + ALU_LAST, + /* 148 */ + ALU_PRED_SETNE_INT(__,_x, KC0(0),_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 149 */ + ALU_SETNE_INT(_R4,_y, KC0(0),_z, ALU_SRC_0,_x) + ALU_LAST, + /* 150 */ + ALU_PRED_SETE_INT(__,_x, _R4,_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 151 */ + ALU_MUL(_R1,_x, _R1,_x, _R16,_x), + ALU_MUL(_R1,_y, _R1,_y, _R16,_y), + ALU_MUL(_R1,_z, _R1,_z, _R16,_z), + ALU_MUL(_R1,_w, _R1,_w, _R16,_w) + ALU_LAST, + }, + { + /* 152 */ + ALU_SETNE_INT(_R4,_x, KC0(0),_z, ALU_SRC_1_INT,_x) + ALU_LAST, + /* 153 */ + ALU_PRED_SETE_INT(__,_x, _R4,_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 154 */ + ALU_ADD(__,_x, _R1 _NEG,_y, _R16,_y), + ALU_ADD(__,_y, _R1 _NEG,_x, _R16,_x), + ALU_ADD(__,_w, _R1 _NEG,_z, _R16,_z) + ALU_LAST, + /* 155 */ + ALU_MULADD(_R1,_x, ALU_SRC_PV,_y, _R16,_w, _R1,_x), + ALU_MULADD(_R1,_y, ALU_SRC_PV,_x, _R16,_w, _R1,_y), + ALU_MULADD(_R1,_z, ALU_SRC_PV,_w, _R16,_w, _R1,_z) + ALU_LAST, + }, + { + /* 156 */ + ALU_SETNE_INT(_R4,_w, KC0(0),_z, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000002), + 
/* 157 */ + ALU_PRED_SETE_INT(__,_x, _R4,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 158 */ + ALU_ADD(__,_x, _R1 _NEG,_z, KC0(7),_z), + ALU_ADD(__,_y, _R1 _NEG,_y, KC0(7),_y), + ALU_ADD(__,_z, _R1 _NEG,_x, KC0(7),_x) + ALU_LAST, + /* 159 */ + ALU_MULADD(_R1,_x, ALU_SRC_PV,_z, _R16,_x, _R1,_x), + ALU_MULADD(_R1,_y, ALU_SRC_PV,_y, _R16,_y, _R1,_y), + ALU_MULADD(_R1,_z, ALU_SRC_PV,_x, _R16,_z, _R1,_z), + ALU_MOV(_R1,_w, _R5,_z) + ALU_LAST, + }, + { + /* 160 */ + ALU_PRED_SETNE_INT(__,_x, KC0(0),_z, ALU_SRC_LITERAL,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + ALU_LITERAL(0x00000003), + }, + { + /* 161 */ + ALU_SETNE_INT(_R6,_z, KC0(0),_z, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000004), + /* 162 */ + ALU_PRED_SETE_INT(__,_x, _R6,_z, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 163 */ + ALU_MOV(_R1,_w, _R5,_z) + ALU_LAST, + /* 164 */ + ALU_MOV(_R1,_x, _R5,_x), + ALU_MOV(_R1,_y, _R5,_y), + ALU_MOV(_R1,_z, _R4,_z), + ALU_MOV(_R1,_w, ALU_SRC_PV,_w) + ALU_LAST, + }, + { + /* 165 */ + ALU_SETNE_INT(_R4,_y, KC0(0),_z, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000005), + /* 166 */ + ALU_PRED_SETE_INT(__,_x, _R4,_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 167 */ + ALU_MOV(_R1,_w, _R5,_z) + ALU_LAST, + /* 168 */ + ALU_MOV(_R1,_x, _R5,_x), + ALU_MOV(_R1,_y, _R5,_y), + ALU_MOV(_R1,_z, _R4,_z), + ALU_MOV(_R1,_w, ALU_SRC_PV,_w) + ALU_LAST, + }, + { + /* 169 */ + ALU_SETNE_INT(__,_x, KC0(0),_z, ALU_SRC_LITERAL,_x), + ALU_SETNE_INT(_R127,_y, KC0(0),_z, ALU_SRC_LITERAL,_y), + ALU_MOV(_R4,_w, _R5,_z), + ALU_MOV(_R5,_w, _R5,_z) + ALU_LAST, + ALU_LITERAL2(0x00000007, 0x00000006), + /* 170 */ + ALU_CNDE_INT(_R123,_x, ALU_SRC_PV,_x, ALU_SRC_PV,_w, _R1,_w), + ALU_CNDE_INT(_R123,_y, ALU_SRC_PV,_x, _R4,_z, _R1,_z), + ALU_CNDE_INT(_R123,_z, ALU_SRC_PV,_x, _R5,_y, _R1,_y), + ALU_CNDE_INT(_R123,_w, ALU_SRC_PV,_x, _R5,_x, _R1,_x) + ALU_LAST, + /* 171 
*/ + ALU_CNDE_INT(_R1,_x, _R127,_y, _R5,_x, ALU_SRC_PV,_w), + ALU_CNDE_INT(_R1,_y, _R127,_y, _R5,_y, ALU_SRC_PV,_z), + ALU_CNDE_INT(_R1,_z, _R127,_y, _R4,_z, ALU_SRC_PV,_y), + ALU_CNDE_INT(_R1,_w, _R127,_y, _R5,_w, ALU_SRC_PV,_x) + ALU_LAST, + }, + { + /* 172 */ + ALU_MOV(_R1,_x, _R16,_x), + ALU_MOV(_R1,_y, _R16,_y), + ALU_MOV(_R1,_z, _R16,_z), + ALU_MOV(_R1,_w, _R16,_w) + ALU_LAST, + }, + { + /* 173 */ + ALU_SETNE_INT(_R5,_z, KC0(0),_z, ALU_SRC_0,_x) + ALU_LAST, + /* 174 */ + ALU_PRED_SETE_INT(__,_x, _R5,_z, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 175 */ + ALU_MUL(_R1,_x, _R1,_x, _R16,_x), + ALU_MUL(_R1,_y, _R1,_y, _R16,_y), + ALU_MUL(_R1,_z, _R1,_z, _R16,_z) + ALU_LAST, + }, + { + /* 176 */ + ALU_SETNE_INT(_R4,_y, KC0(0),_z, ALU_SRC_1_INT,_x) + ALU_LAST, + /* 177 */ + ALU_PRED_SETE_INT(__,_x, _R4,_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 178 */ + ALU_MOV(_R1,_x, _R16,_x), + ALU_MOV(_R1,_y, _R16,_y), + ALU_MOV(_R1,_z, _R16,_z) + ALU_LAST, + }, + { + /* 179 */ + ALU_SETNE_INT(_R4,_x, KC0(0),_z, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000002), + /* 180 */ + ALU_PRED_SETE_INT(__,_x, _R4,_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 181 */ + ALU_ADD(__,_x, _R1 _NEG,_z, KC0(7),_z), + ALU_ADD(__,_y, _R1 _NEG,_y, KC0(7),_y), + ALU_ADD(__,_z, _R1 _NEG,_x, KC0(7),_x) + ALU_LAST, + /* 182 */ + ALU_MULADD(_R1,_x, ALU_SRC_PV,_z, _R16,_x, _R1,_x), + ALU_MULADD(_R1,_y, ALU_SRC_PV,_y, _R16,_y, _R1,_y), + ALU_MULADD(_R1,_z, ALU_SRC_PV,_x, _R16,_z, _R1,_z), + ALU_MOV(_R1,_w, _R1,_w) + ALU_LAST, + }, + { + /* 183 */ + ALU_SETNE_INT(_R4,_w, KC0(0),_z, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000003), + /* 184 */ + ALU_PRED_SETE_INT(__,_x, _R4,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 185 */ + ALU_MOV(_R1,_w, _R1,_w) + ALU_LAST, + /* 186 */ + ALU_MOV(_R1,_x, _R16,_x), + ALU_MOV(_R1,_y, _R16,_y), + 
ALU_MOV(_R1,_z, _R16,_z), + ALU_MOV(_R1,_w, ALU_SRC_PV,_w) + ALU_LAST, + }, + { + /* 187 */ + ALU_SETNE_INT(_R5,_z, KC0(0),_z, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000004), + /* 188 */ + ALU_PRED_SETE_INT(__,_x, _R5,_z, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 189 */ + ALU_MOV(_R1,_w, _R1,_w) + ALU_LAST, + /* 190 */ + ALU_MOV(_R1,_x, _R5,_x), + ALU_MOV(_R1,_y, _R5,_y), + ALU_MOV(_R1,_z, _R4,_z), + ALU_MOV(_R1,_w, ALU_SRC_PV,_w) + ALU_LAST, + }, + { + /* 191 */ + ALU_SETNE_INT(_R4,_y, KC0(0),_z, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000005), + /* 192 */ + ALU_PRED_SETE_INT(__,_x, _R4,_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 193 */ + ALU_MOV(_R1,_w, _R1,_w) + ALU_LAST, + /* 194 */ + ALU_MOV(_R1,_x, _R5,_x), + ALU_MOV(_R1,_y, _R5,_y), + ALU_MOV(_R1,_z, _R4,_z), + ALU_MOV(_R1,_w, ALU_SRC_PV,_w) + ALU_LAST, + }, + { + /* 195 */ + ALU_SETNE_INT(__,_x, KC0(0),_z, ALU_SRC_LITERAL,_x), + ALU_SETNE_INT(_R127,_y, KC0(0),_z, ALU_SRC_LITERAL,_y), + ALU_MOV(_R4,_w, _R1,_w), + ALU_MOV(_R5,_w, _R1,_w) + ALU_LAST, + ALU_LITERAL2(0x00000007, 0x00000006), + /* 196 */ + ALU_CNDE_INT(_R123,_x, ALU_SRC_PV,_x, ALU_SRC_PV,_w, _R1,_w), + ALU_CNDE_INT(_R123,_y, ALU_SRC_PV,_x, _R4,_z, _R1,_z), + ALU_CNDE_INT(_R123,_z, ALU_SRC_PV,_x, _R5,_y, _R1,_y), + ALU_CNDE_INT(_R123,_w, ALU_SRC_PV,_x, _R5,_x, _R1,_x) + ALU_LAST, + /* 197 */ + ALU_CNDE_INT(_R1,_x, _R127,_y, _R5,_x, ALU_SRC_PV,_w), + ALU_CNDE_INT(_R1,_y, _R127,_y, _R5,_y, ALU_SRC_PV,_z), + ALU_CNDE_INT(_R1,_z, _R127,_y, _R4,_z, ALU_SRC_PV,_y), + ALU_CNDE_INT(_R1,_w, _R127,_y, _R5,_w, ALU_SRC_PV,_x) + ALU_LAST, + }, + { + /* 198 */ + ALU_ADD(_R2,_x, _R2,_x, _R1,_x), + ALU_ADD(_R2,_y, _R2,_y, _R1,_y), + ALU_ADD(_R2,_z, _R2,_z, _R1,_z), + ALU_ADD(_R2,_w, _R2,_w, _R1,_w) + ALU_LAST, + /* 199 */ + ALU_ADD(__,_x, ALU_SRC_PV,_x, ALU_SRC_PV,_y) + ALU_LAST, + /* 200 */ + ALU_ADD(_R4,_y, _R2,_z, ALU_SRC_PV,_x) + ALU_LAST, + /* 201 */ + 
ALU_MULADD(_R4,_x, _R2,_x, ALU_SRC_LITERAL,_x, ALU_SRC_0_5,_x), + ALU_MULADD(_R5,_y, _R2,_y, ALU_SRC_LITERAL,_x, ALU_SRC_0_5,_x), + ALU_MULADD(_R4,_z, _R2,_z, ALU_SRC_LITERAL,_x, ALU_SRC_0_5,_x) + ALU_LAST, + ALU_LITERAL(0x437F0000), + /* 202 */ + ALU_PRED_SETNE_INT(__,_x, KC0(2),_z, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 203 */ + ALU_PRED_SETNE_INT(__,_x, KC0(3),_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 204 */ + ALU_SETE_INT(__,_x, KC0(2),_w, ALU_SRC_LITERAL,_x), + ALU_SETE_INT(__,_y, KC0(2),_w, ALU_SRC_LITERAL,_y) + ALU_LAST, + ALU_LITERAL2(0x00000006, 0x00000003), + /* 205 */ + ALU_CNDE_INT(_R1,_z, ALU_SRC_PV,_y, ALU_SRC_PV,_x, ALU_SRC_M_1_INT,_x) + ALU_LAST, + /* 206 */ + ALU_PRED_SETNE_INT(__,_x, _R1,_z, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 207 */ + ALU_KILLGT(__,_x, ALU_SRC_LITERAL,_x, _R2,_w) + ALU_LAST, + ALU_LITERAL(0x3B03126F), + }, + { + /* 208 */ + ALU_PRED_SETGT(__,_x, ALU_SRC_LITERAL,_x, _R2,_w) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + ALU_LITERAL(0x3B03126F), + }, + { + /* 209 */ + ALU_KILLE_INT(__,_x, KC0(2),_w, ALU_SRC_0,_x), + ALU_SETE_INT(_R1,_y, KC0(2),_w, ALU_SRC_0,_x) + ALU_LAST, + }, + { + /* 210 */ + ALU_PRED_SETNE_INT(__,_x, _R1,_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 211 */ + ALU_KILLGT(__,_x, _R2,_w, ALU_SRC_LITERAL,_x), + ALU_SETGT_DX10(_R1,_x, _R2,_w, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x3B03126F), + }, + { + /* 212 */ + ALU_PRED_SETNE_INT(__,_x, _R1,_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 213 */ + ALU_SETNE_INT(_R1,_w, KC0(2),_w, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000002), + /* 214 */ + ALU_PRED_SETE_INT(__,_x, _R1,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 215 */ + ALU_MULADD(_R123,_z, _R2,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0_5,_x) + 
ALU_LAST, + ALU_LITERAL(0x437F0000), + /* 216 */ + ALU_FLOOR(__,_y, ALU_SRC_PV,_z) + ALU_LAST, + /* 217 */ + ALU_FLT_TO_INT(__,_x, ALU_SRC_PV,_y) SCL_210 + ALU_LAST, + /* 218 */ + ALU_AND_INT(_R1,_w, ALU_SRC_PS,_x, KC0(9),_w) + ALU_LAST, + /* 219 */ + ALU_KILLNE_INT(__,_x, KC0(8),_w, ALU_SRC_PV,_w) + ALU_LAST, + }, + { + /* 220 */ + ALU_PRED_SETNE_INT(__,_x, KC0(8),_w, _R1,_w) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 221 */ + ALU_SETNE_INT(_R1,_z, KC0(2),_w, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000003), + /* 222 */ + ALU_PRED_SETE_INT(__,_x, _R1,_z, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 223 */ + ALU_MULADD(_R123,_w, _R2,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0_5,_x) + ALU_LAST, + ALU_LITERAL(0x437F0000), + /* 224 */ + ALU_FLOOR(__,_y, ALU_SRC_PV,_w) + ALU_LAST, + /* 225 */ + ALU_FLT_TO_INT(__,_x, ALU_SRC_PV,_y) SCL_210 + ALU_LAST, + /* 226 */ + ALU_AND_INT(_R1,_z, ALU_SRC_PS,_x, KC0(9),_w) + ALU_LAST, + /* 227 */ + ALU_KILLE_INT(__,_x, ALU_SRC_PV,_z, KC0(8),_w) + ALU_LAST, + }, + { + /* 228 */ + ALU_PRED_SETE_INT(__,_x, _R1,_z, KC0(8),_w) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 229 */ + ALU_SETNE_INT(_R1,_y, KC0(2),_w, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000004), + /* 230 */ + ALU_PRED_SETE_INT(__,_x, _R1,_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 231 */ + ALU_MULADD(_R123,_x, _R2,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0_5,_x) + ALU_LAST, + ALU_LITERAL(0x437F0000), + /* 232 */ + ALU_FLOOR(__,_w, ALU_SRC_PV,_x) + ALU_LAST, + /* 233 */ + ALU_FLT_TO_INT(__,_x, ALU_SRC_PV,_w) SCL_210 + ALU_LAST, + /* 234 */ + ALU_AND_INT(__,_y, ALU_SRC_PS,_x, KC0(9),_w) + ALU_LAST, + /* 235 */ + ALU_KILLGE_INT(__,_x, ALU_SRC_PV,_y, KC0(8),_w), + ALU_SETGE_INT(_R1,_x, ALU_SRC_PV,_y, KC0(8),_w) + ALU_LAST, + }, + { + /* 236 */ + ALU_PRED_SETNE_INT(__,_x, _R1,_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + 
/* 237 */ + ALU_SETNE_INT(_R1,_w, KC0(2),_w, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000005), + /* 238 */ + ALU_PRED_SETE_INT(__,_x, _R1,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 239 */ + ALU_MULADD(_R123,_z, _R2,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0_5,_x) + ALU_LAST, + ALU_LITERAL(0x437F0000), + /* 240 */ + ALU_FLOOR(__,_y, ALU_SRC_PV,_z) + ALU_LAST, + /* 241 */ + ALU_FLT_TO_INT(__,_x, ALU_SRC_PV,_y) SCL_210 + ALU_LAST, + /* 242 */ + ALU_AND_INT(_R1,_w, ALU_SRC_PS,_x, KC0(9),_w) + ALU_LAST, + /* 243 */ + ALU_KILLGT_INT(__,_x, ALU_SRC_PV,_w, KC0(8),_w) + ALU_LAST, + }, + { + /* 244 */ + ALU_PRED_SETGT_INT(__,_x, _R1,_w, KC0(8),_w) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 245 */ + ALU_SETNE_INT(_R1,_z, KC0(2),_w, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000006), + /* 246 */ + ALU_PRED_SETE_INT(__,_x, _R1,_z, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 247 */ + ALU_MULADD(_R123,_y, _R2,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0_5,_x) + ALU_LAST, + ALU_LITERAL(0x437F0000), + /* 248 */ + ALU_FLOOR(__,_w, ALU_SRC_PV,_y) + ALU_LAST, + /* 249 */ + ALU_FLT_TO_INT(__,_x, ALU_SRC_PV,_w) SCL_210 + ALU_LAST, + /* 250 */ + ALU_AND_INT(__,_x, ALU_SRC_PS,_x, KC0(9),_w) + ALU_LAST, + /* 251 */ + ALU_KILLGE_INT(__,_x, KC0(8),_w, ALU_SRC_PV,_x), + ALU_SETGE_INT(_R1,_y, KC0(8),_w, ALU_SRC_PV,_x) + ALU_LAST, + }, + { + /* 252 */ + ALU_PRED_SETNE_INT(__,_x, _R1,_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 253 */ + ALU_SETNE_INT(_R1,_x, KC0(2),_w, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000007), + /* 254 */ + ALU_PRED_SETE_INT(__,_x, _R1,_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 255 */ + ALU_MULADD(_R123,_w, _R2,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0_5,_x) + ALU_LAST, + ALU_LITERAL(0x437F0000), + /* 256 */ + ALU_FLOOR(__,_z, ALU_SRC_PV,_w) + ALU_LAST, + /* 257 */ + ALU_FLT_TO_INT(__,_x, 
ALU_SRC_PV,_z) SCL_210 + ALU_LAST, + /* 258 */ + ALU_AND_INT(_R1,_x, ALU_SRC_PS,_x, KC0(9),_w) + ALU_LAST, + /* 259 */ + ALU_KILLGT_INT(__,_x, KC0(8),_w, ALU_SRC_PV,_x) + ALU_LAST, + }, + { + /* 260 */ + ALU_PRED_SETGT_INT(__,_x, KC0(8),_w, _R1,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 261 */ + ALU_KILLE_INT(__,_x, _R3,_w, ALU_SRC_0,_x) + ALU_LAST, + }, + { + /* 262 */ + ALU_PRED_SETE_INT(__,_x, _R3,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 263 */ + ALU_KILLE_INT(__,_x, _R3,_y, ALU_SRC_0,_x), + ALU_KILLNE_INT(__,_y, KC0(2),_w, ALU_SRC_1_INT,_x) + ALU_LAST, + }, + { + /* 264 */ + ALU_PRED_SETE_INT(__,_x, _R3,_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 265 */ + ALU_PRED_SETNE_INT(__,_x, KC0(3),_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 266 */ + ALU_PRED_SETNE_INT(__,_x, KC0(3),_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 267 */ + ALU_PRED_SETE_INT(__,_x, KC0(3),_z, ALU_SRC_LITERAL,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + ALU_LITERAL(0x00000003), + }, + { + /* 268 */ + ALU_KILLGT(__,_x, ALU_SRC_LITERAL,_x, _R4,_y), + ALU_SETGT_DX10(_R0,_w, ALU_SRC_LITERAL,_x, _R4,_y) + ALU_LAST, + ALU_LITERAL(0x3B03126F), + }, + { + /* 269 */ + ALU_PRED_SETNE_INT(__,_x, _R0,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 270 */ + ALU_KILLE_INT(__,_x, _R0,_w, ALU_SRC_0,_x) + ALU_LAST, + }, + { + /* 271 */ + ALU_PRED_SETNE_INT(__,_x, _R0,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 272 */ + ALU_KILLGT(__,_x, _R4,_y, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x3B03126F), + }, + { + /* 273 */ + ALU_PRED_SETGT(__,_x, _R4,_y, ALU_SRC_LITERAL,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + ALU_LITERAL(0x3B03126F), + }, + { + /* 274 */ + ALU_PRED_SETE_INT(__,_x, KC0(3),_z, ALU_SRC_LITERAL,_x) 
UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + ALU_LITERAL(0x00000002), + }, + { + /* 275 */ + ALU_FLOOR(_R127,_x, _R4,_z), + ALU_FLOOR(_R127,_y, _R5,_y), + ALU_FLOOR(__,_z, _R4,_x), + ALU_AND_INT(_R127,_w, KC0(8),_x, KC0(9),_x), + ALU_AND_INT(_R126,_x, KC0(8),_y, KC0(9),_y) + ALU_LAST, + /* 276 */ + ALU_AND_INT(_R126,_w, KC0(8),_z, KC0(9),_z), + ALU_FLT_TO_INT(__,_w, ALU_SRC_PV,_z) SCL_210 + ALU_LAST, + /* 277 */ + ALU_AND_INT(__,_x, KC0(9),_x, ALU_SRC_PS,_x), + ALU_FLT_TO_INT(__,_x, _R127,_y) SCL_210 + ALU_LAST, + /* 278 */ + ALU_SETE_INT(_R127,_y, _R127,_w, ALU_SRC_PV,_x), + ALU_AND_INT(__,_w, KC0(9),_y, ALU_SRC_PS,_x), + ALU_FLT_TO_INT(__,_y, _R127,_x) SCL_210 + ALU_LAST, + /* 279 */ + ALU_SETE_INT(__,_x, _R126,_x, ALU_SRC_PV,_w), + ALU_AND_INT(__,_z, KC0(9),_z, ALU_SRC_PS,_x) + ALU_LAST, + /* 280 */ + ALU_SETE_INT(__,_w, _R126,_w, ALU_SRC_PV,_z), + ALU_MULLO_INT(__,_w, _R127,_y, ALU_SRC_PV,_x) SCL_210 + ALU_LAST, + /* 281 */ + ALU_MULLO_INT(__,_x, ALU_SRC_PS,_x, ALU_SRC_PV,_w) SCL_210 + ALU_LAST, + /* 282 */ + ALU_NOT_INT(_R1,_x, ALU_SRC_PS,_x) + ALU_LAST, + /* 283 */ + ALU_KILLNE_INT(__,_x, ALU_SRC_PV,_x, ALU_SRC_0,_x) + ALU_LAST, + }, + { + /* 284 */ + ALU_PRED_SETNE_INT(__,_x, _R1,_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 285 */ + ALU_KILLE_INT(__,_x, _R3,_z, ALU_SRC_0,_x) + ALU_LAST, + }, + { + /* 286 */ + ALU_PRED_SETNE_INT(__,_x, _R3,_z, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 287 */ + ALU_FLOOR(_R127,_x, _R5,_y), + ALU_FLOOR(_R127,_y, _R4,_z), + ALU_FLOOR(__,_z, _R4,_x), + ALU_AND_INT(_R127,_w, KC0(8),_x, KC0(9),_x), + ALU_AND_INT(_R126,_y, KC0(8),_y, KC0(9),_y) + ALU_LAST, + /* 288 */ + ALU_AND_INT(_R126,_w, KC0(8),_z, KC0(9),_z), + ALU_FLT_TO_INT(__,_w, ALU_SRC_PV,_z) SCL_210 + ALU_LAST, + /* 289 */ + ALU_AND_INT(__,_y, KC0(9),_x, ALU_SRC_PS,_x), + ALU_FLT_TO_INT(__,_y, _R127,_x) SCL_210 + ALU_LAST, + /* 290 */ + ALU_SETNE_INT(_R127,_x, ALU_SRC_PV,_y, 
_R127,_w), + ALU_AND_INT(__,_w, KC0(9),_y, ALU_SRC_PS,_x), + ALU_FLT_TO_INT(__,_x, _R127,_y) SCL_210 + ALU_LAST, + /* 291 */ + ALU_SETNE_INT(__,_y, ALU_SRC_PV,_w, _R126,_y), + ALU_AND_INT(__,_z, KC0(9),_z, ALU_SRC_PS,_x) + ALU_LAST, + /* 292 */ + ALU_ADD_INT(__,_z, _R127,_x, ALU_SRC_PV,_y), + ALU_SETNE_INT(__,_w, ALU_SRC_PV,_z, _R126,_w) + ALU_LAST, + /* 293 */ + ALU_ADD_INT(__,_x, ALU_SRC_PV,_w, ALU_SRC_PV,_z) + ALU_LAST, + /* 294 */ + ALU_NOT_INT(_R1,_y, ALU_SRC_PV,_x) + ALU_LAST, + /* 295 */ + ALU_KILLNE_INT(__,_x, ALU_SRC_PV,_y, ALU_SRC_0,_x) + ALU_LAST, + }, + { + /* 296 */ + ALU_PRED_SETNE_INT(__,_x, _R1,_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 297 */ + ALU_SETE_INT(__,_x, KC0(5),_z, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000005), + /* 298 */ + ALU_CNDE_INT(_R0,_w, KC0(4),_z, ALU_SRC_0,_x, ALU_SRC_PV,_x) + ALU_LAST, + /* 299 */ + ALU_PRED_SETNE_INT(__,_x, _R0,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 300 */ + ALU_MOV_x4(_R2,_x, _R2,_x), + ALU_MOV_x4(_R2,_y, _R2,_y), + ALU_MOV_x4(_R2,_z, _R2,_z) + ALU_LAST, + }, + { + /* 301 */ + ALU_SETE_INT(__,_x, KC0(5),_z, ALU_SRC_LITERAL,_x), + ALU_MOV_x2(_R1,_y, _R2,_y), + ALU_MOV_x2(_R1,_z, _R2,_z), + ALU_MOV_x2(_R1,_x, _R2,_x) + ALU_LAST, + ALU_LITERAL(0x00000005), + /* 302 */ + ALU_CNDE_INT(_R123,_w, KC0(4),_z, ALU_SRC_PV,_x, ALU_SRC_M_1_INT,_x) + ALU_LAST, + /* 303 */ + ALU_CNDE_INT(_R2,_x, ALU_SRC_PV,_w, _R2,_x, _R1,_x), + ALU_CNDE_INT(_R2,_y, ALU_SRC_PV,_w, _R2,_y, _R1,_y), + ALU_CNDE_INT(_R2,_z, ALU_SRC_PV,_w, _R2,_z, _R1,_z), + ALU_CNDE_INT(_R2,_w, ALU_SRC_PV,_w, _R2,_w, _R2,_w) + ALU_LAST, + }, + { + /* 304 */ + ALU_PRED_SETNE_INT(__,_x, KC0(4),_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 305 */ + ALU_MOV(_R126,_x, _R2,_w), + ALU_MOV(_R127,_y, KC0(6),_z), + ALU_MOV(_R127,_z, KC0(6),_y), + ALU_MOV(_R127,_w, KC0(6),_x), + ALU_MOV(_R127,_x, _R3,_x) CLAMP + ALU_LAST, + /* 306 */ + 
ALU_ADD(__,_x, _R2,_w, ALU_SRC_PV _NEG,_x), + ALU_ADD(__,_y, _R2,_z, ALU_SRC_PV _NEG,_y), + ALU_ADD(__,_z, _R2,_y, ALU_SRC_PV _NEG,_z), + ALU_ADD(__,_w, _R2,_x, ALU_SRC_PV _NEG,_w) + ALU_LAST, + /* 307 */ + ALU_MULADD(_R2,_x, ALU_SRC_PV,_w, _R127,_x, _R127,_w), + ALU_MULADD(_R2,_y, ALU_SRC_PV,_z, _R127,_x, _R127,_z), + ALU_MULADD(_R2,_z, ALU_SRC_PV,_y, _R127,_x, _R127,_y), + ALU_MULADD(_R2,_w, ALU_SRC_PV,_x, _R127,_x, _R126,_x) + ALU_LAST, + }, + { + /* 308 */ + ALU_SETE_INT(__,_x, KC0(5),_z, ALU_SRC_LITERAL,_x), + ALU_SETE_INT(__,_z, KC0(5),_z, ALU_SRC_LITERAL,_y) + ALU_LAST, + ALU_LITERAL2(0x00000003, 0x00000002), + /* 309 */ + ALU_CNDE_INT(_R1,_y, ALU_SRC_PV,_z, ALU_SRC_PV,_x, ALU_SRC_M_1_INT,_x) + ALU_LAST, + /* 310 */ + ALU_PRED_SETNE_INT(__,_x, _R1,_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 311 */ + ALU_SETNE_INT(_R1,_x, KC0(6),_x, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000002), + /* 312 */ + ALU_PRED_SETE_INT(__,_x, _R1,_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 313 */ + ALU_MOV(_R1,_x, _R2,_w), + ALU_MOV(_R1,_y, _R2,_w), + ALU_MOV(_R1,_z, _R2,_w) + ALU_LAST, + }, + { + /* 314 */ + ALU_SETNE_INT(_R0,_w, KC0(6),_x, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000003), + /* 315 */ + ALU_PRED_SETE_INT(__,_x, _R0,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 316 */ + ALU_ADD(__,_w, _R2 _NEG,_w, ALU_SRC_1,_x) + ALU_LAST, + /* 317 */ + ALU_MOV(_R1,_x, ALU_SRC_PV,_w), + ALU_MOV(_R1,_y, ALU_SRC_PV,_w), + ALU_MOV(_R1,_z, ALU_SRC_PV,_w) + ALU_LAST, + }, + { + /* 318 */ + ALU_SETNE_INT(_R1,_y, KC0(6),_x, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000006), + /* 319 */ + ALU_PRED_SETE_INT(__,_x, _R1,_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 320 */ + ALU_MOV_x2(_R1,_x, _R2,_w) + ALU_LAST, + /* 321 */ + ALU_MOV(_R1,_x, ALU_SRC_PV,_x), + ALU_MOV(_R1,_y, ALU_SRC_PV,_x), + ALU_MOV(_R1,_z, 
ALU_SRC_PV,_x) + ALU_LAST, + }, + { + /* 322 */ + ALU_SETNE_INT(_R127,_x, KC0(6),_x, ALU_SRC_LITERAL,_x), + ALU_SETNE_INT(__,_z, KC0(6),_x, ALU_SRC_LITERAL,_y), + ALU_MULADD(_R127,_w, _R2 _NEG,_w, ALU_SRC_LITERAL,_z, ALU_SRC_1,_x) + ALU_LAST, + ALU_LITERAL3(0x00000007, 0x0000000A, 0x40000000), + /* 323 */ + ALU_CNDE_INT(_R123,_x, ALU_SRC_PV,_z, KC1(10),_y, ALU_SRC_LITERAL,_x), + ALU_CNDE_INT(_R123,_y, ALU_SRC_PV,_z, KC1(10),_x, ALU_SRC_LITERAL,_x), + ALU_CNDE_INT(_R123,_w, ALU_SRC_PV,_z, KC1(10),_z, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x3F800000), + /* 324 */ + ALU_CNDE_INT(_R1,_x, _R127,_x, _R127,_w, ALU_SRC_PV,_y), + ALU_CNDE_INT(_R1,_y, _R127,_x, _R127,_w, ALU_SRC_PV,_x), + ALU_CNDE_INT(_R1,_z, _R127,_x, _R127,_w, ALU_SRC_PV,_w) + ALU_LAST, + }, + { + /* 325 */ + ALU_MUL(_R2,_x, _R2,_x, _R1,_x), + ALU_MUL(_R2,_y, _R2,_y, _R1,_y), + ALU_MUL(_R2,_z, _R2,_z, _R1,_z) + ALU_LAST, + }, + { + /* 326 */ + ALU_ADD(_R3,_x, _R2 _NEG,_w, ALU_SRC_1,_x) + ALU_LAST, + /* 327 */ + ALU_PRED_SETE_INT(__,_x, KC0(5),_z, ALU_SRC_LITERAL,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + ALU_LITERAL(0x00000006), + }, + { + /* 328 */ + ALU_MOV(_R1,_w, ALU_SRC_0,_x), + ALU_FLT_TO_INT(_R1,_x, _R0,_x) SCL_210 + ALU_LAST, + /* 329 */ + ALU_FLT_TO_INT(_R1,_y, _R0,_y) SCL_210 + ALU_LAST, + }, + { + /* 330 */ + ALU_ADD(_R1,_w, _R4 _NEG,_w, ALU_SRC_1,_x) + ALU_LAST, + /* 331 */ + ALU_PRED_SETNE_INT(__,_x, KC0(6),_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 332 */ + ALU_SETNE_INT(_R1,_z, KC0(6),_x, ALU_SRC_1_INT,_x) + ALU_LAST, + /* 333 */ + ALU_PRED_SETE_INT(__,_x, _R1,_z, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 334 */ + ALU_ADD(_R1,_x, _R4 _NEG,_x, ALU_SRC_1,_x), + ALU_ADD(_R1,_y, _R4 _NEG,_y, ALU_SRC_1,_x), + ALU_ADD(_R1,_z, _R4 _NEG,_z, ALU_SRC_1,_x) + ALU_LAST, + }, + { + /* 335 */ + ALU_SETNE_INT(_R0,_y, KC0(6),_x, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000002), + /* 336 */ + 
ALU_PRED_SETE_INT(__,_x, _R0,_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 337 */ + ALU_MOV(_R1,_x, _R2,_w), + ALU_MOV(_R1,_y, _R2,_w), + ALU_MOV(_R1,_z, _R2,_w) + ALU_LAST, + }, + { + /* 338 */ + ALU_SETNE_INT(_R0,_x, KC0(6),_x, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000003), + /* 339 */ + ALU_PRED_SETE_INT(__,_x, _R0,_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 340 */ + ALU_MOV(_R1,_x, _R3,_x), + ALU_MOV(_R1,_y, _R3,_x), + ALU_MOV(_R1,_z, _R3,_x) + ALU_LAST, + }, + { + /* 341 */ + ALU_SETNE_INT(_R0,_w, KC0(6),_x, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000004), + /* 342 */ + ALU_PRED_SETE_INT(__,_x, _R0,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 343 */ + ALU_MOV(_R1,_x, _R4,_w), + ALU_MOV(_R1,_y, _R4,_w), + ALU_MOV(_R1,_z, _R4,_w) + ALU_LAST, + }, + { + /* 344 */ + ALU_SETNE_INT(_R1,_z, KC0(6),_x, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000005), + /* 345 */ + ALU_PRED_SETE_INT(__,_x, _R1,_z, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 346 */ + ALU_MOV(_R1,_x, _R1,_w), + ALU_MOV(_R1,_y, _R1,_w), + ALU_MOV(_R1,_z, _R1,_w) + ALU_LAST, + }, + { + /* 347 */ + ALU_SETNE_INT(_R0,_y, KC0(6),_x, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000006), + /* 348 */ + ALU_PRED_SETE_INT(__,_x, _R0,_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 349 */ + ALU_MOV_x2(_R0,_x, _R2,_w) + ALU_LAST, + /* 350 */ + ALU_MOV(_R1,_x, ALU_SRC_PV,_x), + ALU_MOV(_R1,_y, ALU_SRC_PV,_x), + ALU_MOV(_R1,_z, ALU_SRC_PV,_x) + ALU_LAST, + }, + { + /* 351 */ + ALU_SETNE_INT(_R0,_w, KC0(6),_x, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000007), + /* 352 */ + ALU_PRED_SETE_INT(__,_x, _R0,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 353 */ + ALU_MULADD(_R123,_w, _R2 _NEG,_w, ALU_SRC_LITERAL,_x, ALU_SRC_1,_x) + ALU_LAST, + 
ALU_LITERAL(0x40000000), + /* 354 */ + ALU_MOV(_R1,_x, ALU_SRC_PV,_w), + ALU_MOV(_R1,_y, ALU_SRC_PV,_w), + ALU_MOV(_R1,_z, ALU_SRC_PV,_w) + ALU_LAST, + }, + { + /* 355 */ + ALU_SETNE_INT(_R0,_y, KC0(6),_x, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000008), + /* 356 */ + ALU_PRED_SETE_INT(__,_x, _R0,_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 357 */ + ALU_MOV_x2(_R0,_x, _R4,_w) + ALU_LAST, + /* 358 */ + ALU_MOV(_R1,_x, ALU_SRC_PV,_x), + ALU_MOV(_R1,_y, ALU_SRC_PV,_x), + ALU_MOV(_R1,_z, ALU_SRC_PV,_x) + ALU_LAST, + }, + { + /* 359 */ + ALU_SETNE_INT(_R127,_x, KC0(6),_x, ALU_SRC_LITERAL,_x), + ALU_SETNE_INT(__,_z, KC0(6),_x, ALU_SRC_LITERAL,_y), + ALU_MULADD(_R127,_w, _R4 _NEG,_w, ALU_SRC_LITERAL,_z, ALU_SRC_1,_x) + ALU_LAST, + ALU_LITERAL3(0x00000009, 0x0000000A, 0x40000000), + /* 360 */ + ALU_CNDE_INT(_R123,_x, ALU_SRC_PV,_z, KC1(10),_y, ALU_SRC_LITERAL,_x), + ALU_CNDE_INT(_R123,_y, ALU_SRC_PV,_z, KC1(10),_x, ALU_SRC_LITERAL,_x), + ALU_CNDE_INT(_R123,_w, ALU_SRC_PV,_z, KC1(10),_z, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x3F800000), + /* 361 */ + ALU_CNDE_INT(_R1,_x, _R127,_x, _R127,_w, ALU_SRC_PV,_y), + ALU_CNDE_INT(_R1,_y, _R127,_x, _R127,_w, ALU_SRC_PV,_x), + ALU_CNDE_INT(_R1,_z, _R127,_x, _R127,_w, ALU_SRC_PV,_w) + ALU_LAST, + }, + { + /* 362 */ + ALU_MOV(_R1,_x, _R4,_x), + ALU_MOV(_R1,_y, _R4,_y), + ALU_MOV(_R1,_z, _R4,_z) + ALU_LAST, + }, + { + /* 363 */ + ALU_MUL(_R1,_x, _R2,_x, _R1,_x), + ALU_MUL(_R1,_y, _R2,_y, _R1,_y), + ALU_MUL(_R5,_z, _R2,_z, _R1,_z) + ALU_LAST, + /* 364 */ + ALU_PRED_SETNE_INT(__,_x, KC0(6),_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 365 */ + ALU_SETNE_INT(_R0,_x, KC0(6),_y, ALU_SRC_1_INT,_x) + ALU_LAST, + /* 366 */ + ALU_PRED_SETE_INT(__,_x, _R0,_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 367 */ + ALU_ADD(_R3,_x, _R2 _NEG,_x, ALU_SRC_1,_x), + ALU_ADD(_R3,_y, _R2 _NEG,_y, ALU_SRC_1,_x), + 
ALU_ADD(_R3,_z, _R2 _NEG,_z, ALU_SRC_1,_x) + ALU_LAST, + }, + { + /* 368 */ + ALU_SETNE_INT(_R0,_w, KC0(6),_y, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000002), + /* 369 */ + ALU_PRED_SETE_INT(__,_x, _R0,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 370 */ + ALU_MOV(_R3,_x, _R2,_w), + ALU_MOV(_R3,_y, _R2,_w), + ALU_MOV(_R3,_z, _R2,_w) + ALU_LAST, + }, + { + /* 371 */ + ALU_SETNE_INT(_R1,_z, KC0(6),_y, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000003), + /* 372 */ + ALU_PRED_SETE_INT(__,_x, _R1,_z, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 373 */ + ALU_MOV(_R3,_x, _R3,_x), + ALU_MOV(_R3,_y, _R3,_x), + ALU_MOV(_R3,_z, _R3,_x) + ALU_LAST, + }, + { + /* 374 */ + ALU_SETNE_INT(_R0,_y, KC0(6),_y, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000004), + /* 375 */ + ALU_PRED_SETE_INT(__,_x, _R0,_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 376 */ + ALU_MOV(_R3,_x, _R4,_w), + ALU_MOV(_R3,_y, _R4,_w), + ALU_MOV(_R3,_z, _R4,_w) + ALU_LAST, + }, + { + /* 377 */ + ALU_SETNE_INT(_R0,_x, KC0(6),_y, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000005), + /* 378 */ + ALU_PRED_SETE_INT(__,_x, _R0,_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 379 */ + ALU_MOV(_R3,_x, _R1,_w), + ALU_MOV(_R3,_y, _R1,_w), + ALU_MOV(_R3,_z, _R1,_w) + ALU_LAST, + }, + { + /* 380 */ + ALU_SETNE_INT(_R0,_w, KC0(6),_y, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000006), + /* 381 */ + ALU_PRED_SETE_INT(__,_x, _R0,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 382 */ + ALU_MOV_x2(__,_w, _R2,_w) + ALU_LAST, + /* 383 */ + ALU_MOV(_R3,_x, ALU_SRC_PV,_w), + ALU_MOV(_R3,_y, ALU_SRC_PV,_w), + ALU_MOV(_R3,_z, ALU_SRC_PV,_w) + ALU_LAST, + }, + { + /* 384 */ + ALU_SETNE_INT(_R0,_y, KC0(6),_y, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000007), + /* 385 */ + ALU_PRED_SETE_INT(__,_x, _R0,_y, 
ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 386 */ + ALU_MULADD(_R0,_x, _R2 _NEG,_w, ALU_SRC_LITERAL,_x, ALU_SRC_1,_x) + ALU_LAST, + ALU_LITERAL(0x40000000), + /* 387 */ + ALU_MOV(_R3,_x, ALU_SRC_PV,_x), + ALU_MOV(_R3,_y, ALU_SRC_PV,_x), + ALU_MOV(_R3,_z, ALU_SRC_PV,_x) + ALU_LAST, + }, + { + /* 388 */ + ALU_SETNE_INT(_R0,_w, KC0(6),_y, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000008), + /* 389 */ + ALU_PRED_SETE_INT(__,_x, _R0,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 390 */ + ALU_MOV_x2(__,_w, _R4,_w) + ALU_LAST, + /* 391 */ + ALU_MOV(_R3,_x, ALU_SRC_PV,_w), + ALU_MOV(_R3,_y, ALU_SRC_PV,_w), + ALU_MOV(_R3,_z, ALU_SRC_PV,_w) + ALU_LAST, + }, + { + /* 392 */ + ALU_SETNE_INT(_R127,_x, KC0(6),_y, ALU_SRC_LITERAL,_x), + ALU_SETNE_INT(__,_z, KC0(6),_y, ALU_SRC_LITERAL,_y), + ALU_MULADD(_R127,_w, _R4 _NEG,_w, ALU_SRC_LITERAL,_z, ALU_SRC_1,_x) + ALU_LAST, + ALU_LITERAL3(0x00000009, 0x0000000A, 0x40000000), + /* 393 */ + ALU_CNDE_INT(_R123,_x, ALU_SRC_PV,_z, KC1(11),_y, ALU_SRC_0,_x), + ALU_CNDE_INT(_R123,_y, ALU_SRC_PV,_z, KC1(11),_x, ALU_SRC_0,_x), + ALU_CNDE_INT(_R123,_w, ALU_SRC_PV,_z, KC1(11),_z, ALU_SRC_0,_x) + ALU_LAST, + /* 394 */ + ALU_CNDE_INT(_R3,_x, _R127,_x, _R127,_w, ALU_SRC_PV,_y), + ALU_CNDE_INT(_R3,_y, _R127,_x, _R127,_w, ALU_SRC_PV,_x), + ALU_CNDE_INT(_R3,_z, _R127,_x, _R127,_w, ALU_SRC_PV,_w) + ALU_LAST, + }, + { + /* 395 */ + ALU_MOV(_R3,_x, _R2,_x), + ALU_MOV(_R3,_y, _R2,_y), + ALU_MOV(_R3,_z, _R2,_z) + ALU_LAST, + }, + { + /* 396 */ + ALU_MUL(_R0,_x, _R4,_x, _R3,_x), + ALU_MUL(_R0,_y, _R4,_y, _R3,_y), + ALU_MUL(_R1,_z, _R4,_z, _R3,_z) + ALU_LAST, + /* 397 */ + ALU_SETNE_INT(_R3,_z, KC0(5),_w, ALU_SRC_0,_x) + ALU_LAST, + /* 398 */ + ALU_PRED_SETE_INT(__,_x, _R3,_z, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 399 */ + ALU_ADD(_R2,_x, _R0,_x, _R1,_x), + ALU_ADD(_R2,_y, _R0,_y, _R1,_y), + ALU_ADD(_R2,_z, _R1,_z, _R5,_z) + 
ALU_LAST, + }, + { + /* 400 */ + ALU_SETNE_INT(_R3,_y, KC0(5),_w, ALU_SRC_1_INT,_x) + ALU_LAST, + /* 401 */ + ALU_PRED_SETE_INT(__,_x, _R3,_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 402 */ + ALU_ADD(_R2,_x, _R0 _NEG,_x, _R1,_x), + ALU_ADD(_R2,_y, _R0 _NEG,_y, _R1,_y), + ALU_ADD(_R2,_z, _R1 _NEG,_z, _R5,_z) + ALU_LAST, + }, + { + /* 403 */ + ALU_SETNE_INT(_R3,_x, KC0(5),_w, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000002), + /* 404 */ + ALU_PRED_SETE_INT(__,_x, _R3,_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 405 */ + ALU_ADD(_R2,_x, _R0,_x, _R1 _NEG,_x), + ALU_ADD(_R2,_y, _R0,_y, _R1 _NEG,_y), + ALU_ADD(_R2,_z, _R1,_z, _R5 _NEG,_z) + ALU_LAST, + }, + { + /* 406 */ + ALU_SETNE_INT(_R0,_w, KC0(5),_w, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000003), + /* 407 */ + ALU_PRED_SETE_INT(__,_x, _R0,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 408 */ + ALU_MIN(_R2,_x, _R2,_x, _R4,_x), + ALU_MIN(_R2,_y, _R2,_y, _R4,_y), + ALU_MIN(_R2,_z, _R2,_z, _R4,_z) + ALU_LAST, + }, + { + /* 409 */ + ALU_ADD(__,_x, _R2,_y, _R4 _NEG,_y), + ALU_ADD(__,_y, _R2,_x, _R4 _NEG,_x), + ALU_SETNE_INT(_R127,_z, KC0(5),_w, ALU_SRC_LITERAL,_x), + ALU_ADD(__,_w, _R2,_z, _R4 _NEG,_z), + ALU_SETNE_INT(_R127,_w, KC0(5),_w, ALU_SRC_LITERAL,_y) + ALU_LAST, + ALU_LITERAL2(0x00000005, 0x00000004), + /* 410 */ + ALU_MAX_DX10(_R0,_x, ALU_SRC_PV,_y, ALU_SRC_PV _NEG,_y), + ALU_MAX_DX10(_R0,_y, ALU_SRC_PV,_x, ALU_SRC_PV _NEG,_x), + ALU_MAX_DX10(_R1,_z, ALU_SRC_PV,_w, ALU_SRC_PV _NEG,_w), + ALU_CNDE_INT(_R126,_w, ALU_SRC_PV,_z, _R2,_w, _R2,_w), + ALU_MAX(_R1,_x, _R2,_x, _R4,_x) + ALU_LAST, + /* 411 */ + ALU_CNDE_INT(_R123,_x, _R127,_z, ALU_SRC_PV,_y, _R2,_y) VEC_201, + ALU_MAX(_R0,_y, _R2,_y, _R4,_y) VEC_021, + ALU_MAX(_R1,_z, _R2,_z, _R4,_z), + ALU_CNDE_INT(_R123,_w, _R127,_z, ALU_SRC_PV,_x, _R2,_x) VEC_201, + ALU_CNDE_INT(_R122,_x, _R127,_z, ALU_SRC_PV,_z, _R2,_z) + ALU_LAST, 
+ /* 412 */ + ALU_CNDE_INT(_R2,_x, _R127,_w, _R1,_x, ALU_SRC_PV,_w), + ALU_CNDE_INT(_R2,_y, _R127,_w, ALU_SRC_PV,_y, ALU_SRC_PV,_x), + ALU_CNDE_INT(_R2,_z, _R127,_w, ALU_SRC_PV,_z, ALU_SRC_PS,_x), + ALU_CNDE_INT(_R2,_w, _R127,_w, _R2,_w, _R126,_w) + ALU_LAST, + }, + { + /* 413 */ + ALU_SETE_INT(__,_y, KC0(5),_z, ALU_SRC_LITERAL,_x), + ALU_SETE_INT(__,_z, KC0(5),_z, ALU_SRC_LITERAL,_y), + ALU_MOV_x2(_R0,_w, _R2,_w) + ALU_LAST, + ALU_LITERAL2(0x00000003, 0x00000004), + /* 414 */ + ALU_CNDE_INT(_R123,_x, ALU_SRC_PV,_z, ALU_SRC_PV,_y, ALU_SRC_M_1_INT,_x) + ALU_LAST, + /* 415 */ + ALU_CNDE_INT(_R1,_x, ALU_SRC_PV,_x, _R2,_x, _R2,_x), + ALU_CNDE_INT(_R1,_y, ALU_SRC_PV,_x, _R2,_y, _R2,_y), + ALU_CNDE_INT(_R1,_z, ALU_SRC_PV,_x, _R2,_z, _R2,_z), + ALU_CNDE_INT(_R1,_w, ALU_SRC_PV,_x, _R2,_w, _R0,_w) + ALU_LAST, + }, + { + /* 416 */ + ALU_MOV(_R0,_y, ALU_SRC_0,_x) + ALU_LAST, + /* 417 */ + ALU_PRED_SETNE_INT(__,_x, ALU_SRC_0,_x, KC0(4),_w) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 418 */ + ALU_SETNE_INT(_R0,_x, KC0(5),_x, ALU_SRC_0,_x) + ALU_LAST, + /* 419 */ + ALU_PRED_SETE_INT(__,_x, _R0,_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 420 */ + ALU_MOV(_R0,_y, KC0(3),_z) + ALU_LAST, + }, + { + /* 421 */ + ALU_PRED_SETNE_INT(__,_x, KC0(5),_x, ALU_SRC_1_INT,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 422 */ + ALU_SETNE_INT(_R2,_z, KC0(5),_x, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000002), + /* 423 */ + ALU_PRED_SETE_INT(__,_x, _R2,_z, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 424 */ + ALU_MOV(_R0,_y, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x3F800000), + }, + { + /* 425 */ + ALU_SETNE_INT(_R0,_x, KC0(5),_x, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000004), + /* 426 */ + ALU_PRED_SETE_INT(__,_x, _R0,_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 427 */ + ALU_MOV(_R0,_y, ALU_SRC_LITERAL,_x) 
+ ALU_LAST, + ALU_LITERAL(0x3F800000), + }, + { + /* 428 */ + ALU_SETNE_INT(_R2,_z, KC0(5),_x, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000005), + /* 429 */ + ALU_PRED_SETE_INT(__,_x, _R2,_z, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 430 */ + ALU_MOV(_R0,_y, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x3D888889), + }, + { + /* 431 */ + ALU_SETNE_INT(_R127,_y, KC0(5),_x, ALU_SRC_LITERAL,_x), + ALU_SETNE_INT(__,_w, KC0(5),_x, ALU_SRC_LITERAL,_y) + ALU_LAST, + ALU_LITERAL2(0x00000006, 0x00000008), + /* 432 */ + ALU_SETNE_INT(_R127,_x, KC0(5),_x, ALU_SRC_LITERAL,_x), + ALU_CNDE_INT(_R123,_z, ALU_SRC_PV,_w, ALU_SRC_LITERAL,_y, ALU_SRC_0,_x) + ALU_LAST, + ALU_LITERAL2(0x00000007, 0x3B808081), + /* 433 */ + ALU_CNDE_INT(_R123,_x, _R127,_y, ALU_SRC_LITERAL,_x, ALU_SRC_PV,_z) + ALU_LAST, + ALU_LITERAL(0x3B808081), + /* 434 */ + ALU_CNDE_INT(_R0,_y, _R127,_x, ALU_SRC_LITERAL,_x, ALU_SRC_PV,_x) + ALU_LAST, + ALU_LITERAL(0x3D888889), + }, + { + /* 435 */ + ALU_SETNE_INT(_R0,_x, KC0(4),_w, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000002), + /* 436 */ + ALU_PRED_SETE_INT(__,_x, _R0,_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 437 */ + ALU_MOV(_R0,_x, ALU_SRC_0,_x), + ALU_MOV(_R0,_w, _R0,_y) + ALU_LAST, + /* 438 */ + ALU_MOV(_R17,_x, _R1,_x), + ALU_MOV(_R17,_y, _R1,_y), + ALU_MOV(_R17,_z, _R1,_z), + ALU_MOV(_R17,_w, ALU_SRC_PV,_w) + ALU_LAST, + /* 439 */ + ALU_MOV(_R19,_x, _R0,_x), + ALU_MOV(_R19,_w, _R1,_w) + ALU_LAST, + }, + { + /* 440 */ + ALU_SETNE_INT(_R0,_w, KC0(4),_w, ALU_SRC_1_INT,_x) + ALU_LAST, + /* 441 */ + ALU_PRED_SETE_INT(__,_x, _R0,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 442 */ + ALU_MOV(_R0,_w, _R0,_y) + ALU_LAST, + /* 443 */ + ALU_MOV(_R17,_x, _R1,_x), + ALU_MOV(_R17,_y, _R1,_y), + ALU_MOV(_R17,_z, _R1,_z), + ALU_MOV(_R17,_w, ALU_SRC_PV,_w) + ALU_LAST, + }, + { + /* 444 */ + ALU_KILLNE_INT(__,_x, KC0(4),_w, ALU_SRC_0,_x) 
+ ALU_LAST, + }, + { + /* 445 */ + ALU_PRED_SETNE_INT(__,_x, KC0(4),_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 446 */ + ALU_MOV(_R17,_x, _R1,_x), + ALU_MOV(_R17,_y, _R1,_y), + ALU_MOV(_R17,_z, _R1,_z), + ALU_MOV(_R17,_w, _R1,_w) + ALU_LAST, + }, + { + /* 447 */ + ALU_SETNE_INT(_R1,_z, KC0(5),_y, ALU_SRC_1_INT,_x) + ALU_LAST, + /* 448 */ + ALU_PRED_SETE_INT(__,_x, _R1,_z, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 449 */ + ALU_MOV(_R17,_x, ALU_SRC_LITERAL,_x), + ALU_MOV(_R17,_y, ALU_SRC_LITERAL,_x), + ALU_MOV(_R17,_z, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x3F800000), + }, + { + /* 450 */ + ALU_SETNE_INT(_R0,_y, KC0(5),_y, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000002), + /* 451 */ + ALU_PRED_SETE_INT(__,_x, _R0,_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 452 */ + ALU_ADD(_R17,_x, _R17 _NEG,_x, ALU_SRC_1,_x), + ALU_ADD(_R17,_y, _R17 _NEG,_y, ALU_SRC_1,_x), + ALU_ADD(_R17,_z, _R17 _NEG,_z, ALU_SRC_1,_x) + ALU_LAST, + }, + { + /* 453 */ + ALU_KILLNE_INT(__,_x, KC0(5),_y, ALU_SRC_0,_x) + ALU_LAST, + }, + { + /* 454 */ + ALU_PRED_SETNE_INT(__,_x, KC0(7),_z, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 455 */ + ALU_NOT_INT(__,_z, KC0(7),_y) + ALU_LAST, + /* 456 */ + ALU_CNDE_INT(_R123,_y, ALU_SRC_PV,_z, ALU_SRC_LITERAL,_y, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL2(0x487FFF00, 0x477FFF00), + /* 457 */ + ALU_CNDE_INT(_R123,_x, KC0(7),_x, ALU_SRC_PV,_y, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x4B7FFF00), + /* 458 */ + ALU_CNDE_INT(_R127,_w, KC0(7),_w, ALU_SRC_LITERAL,_x, ALU_SRC_PV,_x) + ALU_LAST, + ALU_LITERAL(0x477FFF00), + /* 459 */ + ALU_ADD(_R127,_z, ALU_SRC_PV,_w, ALU_SRC_1 _NEG,_x), + ALU_RECIP_IEEE(_R126,_y, ALU_SRC_PV,_w) SCL_210 + ALU_LAST, + /* 460 */ + ALU_ADD(__,_y, ALU_SRC_PV,_z, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0xC0000000), + /* 461 */ + ALU_MUL_IEEE(__,_x, 
ALU_SRC_PV,_y, ALU_SRC_0_5,_x) + ALU_LAST, + /* 462 */ + ALU_ADD(__,_w, ALU_SRC_PV,_x, ALU_SRC_1,_x) + ALU_LAST, + /* 463 */ + ALU_FLOOR(__,_z, ALU_SRC_PV,_w) + ALU_LAST, + /* 464 */ + ALU_MULADD_D2(_R127,_y, ALU_SRC_PV _NEG,_z, ALU_SRC_LITERAL,_x, _R127,_z) + ALU_LAST, + ALU_LITERAL(0x40000000), + /* 465 */ + ALU_MULADD(_R123,_x, _R0,_z, _R127,_w, ALU_SRC_PV _NEG,_y) + ALU_LAST, + /* 466 */ + ALU_FLOOR(__,_w, ALU_SRC_PV,_x) + ALU_LAST, + /* 467 */ + ALU_ADD(__,_z, _R127,_y, ALU_SRC_PV,_w) + ALU_LAST, + /* 468 */ + ALU_MUL_IEEE(_R20,_x, ALU_SRC_PV,_z, _R126,_y) + ALU_LAST, + }, + { + /* 469 */ + ALU_MOV(_R1,_y, ALU_SRC_0,_x), + ALU_MOV(_R1,_w, ALU_SRC_1,_x) + ALU_LAST, + /* 470 */ + ALU_MOV(_R0,_x, _R17,_x), + ALU_MOV(_R0,_y, _R17,_y), + ALU_MOV(_R0,_z, _R17,_z), + ALU_MOV(_R0,_w, _R17,_w) + ALU_LAST, + /* 471 */ + ALU_MOV(_R2,_x, _R20,_x), + ALU_MOV(_R2,_y, _R1,_y), + ALU_MOV(_R2,_z, _R1,_y), + ALU_MOV(_R2,_w, _R1,_w) + ALU_LAST, + /* 472 */ + ALU_MOV(_R1,_x, _R19,_x), + ALU_MOV(_R1,_y, _R19,_x), + ALU_MOV(_R1,_z, _R19,_x), + ALU_MOV(_R1,_w, _R19,_w) + ALU_LAST, + }, + { + TEX_SAMPLE(_R16,_x,_y,_z,_w, _R4,_x,_y,_0,_x, _t0, _s0), + }, + { + TEX_SAMPLE(_R9,_x,_y,_z,_w, _R4,_x,_y,_0,_x, _t0, _s0) XOFFSET(1), + TEX_SAMPLE(_R8,_x,_y,_z,_w, _R4,_x,_y,_0,_x, _t0, _s0) YOFFSET(1), + TEX_SAMPLE(_R7,_x,_y,_z,_w, _R4,_x,_y,_0,_x, _t0, _s0) XOFFSET(1) YOFFSET(1), + }, + { + TEX_SAMPLE(_R16,_x,_y,_z,_w, _R6,_x,_y,_0,_x, _t0, _s0), + }, + { + TEX_SAMPLE(_R9,_x,_y,_z,_w, _R6,_x,_y,_0,_x, _t0, _s0) XOFFSET(1), + TEX_SAMPLE(_R8,_x,_y,_z,_w, _R6,_x,_y,_0,_x, _t0, _s0) YOFFSET(1), + TEX_SAMPLE(_R7,_x,_y,_z,_w, _R6,_x,_y,_0,_x, _t0, _s0) XOFFSET(1) YOFFSET(1), + }, + { + TEX_GET_TEXTURE_INFO(_R5,_x,_y,_m,_m, _R4,_z,_z,_0,_z, _t0, _s0), + }, + { + TEX_LD(_R16,_x,_y,_z,_w, _R4,_x,_y,_0,_y, _t2, _s0), + }, + { + TEX_LD(_R6,_x,_y,_z,_w, _R4,_x,_z,_0,_z, _t2, _s0), + TEX_LD(_R4,_x,_y,_z,_w, _R4,_w,_y,_0,_y, _t2, _s0), + TEX_LD(_R5,_x,_y,_z,_w, _R5,_y,_w,_0,_w, _t2, _s0), + }, + { + 
TEX_LD(_R4,_x,_y,_z,_w, _R1,_x,_y,_0,_w, _t1, _s0), + }, +}; + +GX2PixelShader PShaderAllGX2 = { + { + .sq_pgm_resources_ps.num_gprs = 21, + .sq_pgm_resources_ps.stack_size = 4, + .sq_pgm_exports_ps.export_mode = 0x6, + .spi_ps_in_control_0.num_interp = 5, + .spi_ps_in_control_0.position_ena = TRUE, + .spi_ps_in_control_0.persp_gradient_ena = TRUE, + .spi_ps_in_control_0.baryc_sample_cntl = spi_baryc_cntl_centers_only, + .num_spi_ps_input_cntl = 5, + { { .semantic = 0, .default_val = 1 }, + { .semantic = 0, .default_val = 1 }, + { .semantic = 1, .default_val = 1 }, + { .semantic = 2, .default_val = 1 }, + { .semantic = 3, .default_val = 1 } }, + .cb_shader_mask.output0_enable = 0xF, + .cb_shader_mask.output1_enable = 0xF, + .cb_shader_mask.output2_enable = 0xF, + .cb_shader_control.rt0_enable = TRUE, + .cb_shader_control.rt1_enable = TRUE, + .cb_shader_control.rt2_enable = TRUE, + .db_shader_control.z_order = db_z_order_early_z_then_late_z, + .db_shader_control.kill_enable = TRUE, + .spi_input_z = TRUE, + }, /* regs */ + .size = sizeof(PShaderAllCode), + .program = (uint8_t *)&PShaderAllCode, + .mode = GX2_SHADER_MODE_UNIFORM_BLOCK, + .gx2rBuffer.flags = GX2R_RESOURCE_LOCKED_READ_ONLY, +}; + + +__attribute__((aligned(GX2_SHADER_ALIGNMENT))) +static struct +{ + u64 cf[128]; + u64 alu[81]; /* 128 */ + u64 alu1[12]; /* 209 */ + u64 alu2[1]; /* 221 */ + u64 alu3[8]; /* 222 */ + u64 alu4[6]; /* 230 */ + u64 alu5[12]; /* 236 */ + u64 alu6[3]; /* 248 */ + u64 alu7[5]; /* 251 */ + u64 alu8[3]; /* 256 */ + u64 alu9[1]; /* 259 */ + u64 alu10[11]; /* 260 */ + u64 alu11[13]; /* 271 */ + u64 alu12[1]; /* 284 */ + u64 alu13[2]; /* 285 */ + u64 alu14[3]; /* 287 */ + u64 alu15[20]; /* 290 */ + u64 alu16[13]; /* 310 */ + u64 alu17[17]; /* 323 */ + u64 alu18[18]; /* 340 */ + u64 alu19[3]; /* 358 */ + u64 alu20[9]; /* 361 */ + u64 alu21[1]; /* 370 */ + u64 alu22[23]; /* 371 */ + u64 alu23[1]; /* 394 */ + u64 alu24[4]; /* 395 */ + u64 alu25[4]; /* 399 */ + u64 alu26[3]; /* 403 */ + 
u64 alu27[1]; /* 406 */ + u64 alu28[2]; /* 407 */ + u64 alu29[2]; /* 409 */ + u64 alu30[1]; /* 411 */ + u64 alu31[3]; /* 412 */ + u64 alu32[3]; /* 415 */ + u64 alu33[9]; /* 418 */ + u64 alu34[3]; /* 427 */ + u64 alu35[2]; /* 430 */ + u64 alu36[1]; /* 432 */ + u64 alu37[3]; /* 433 */ + u64 alu38[3]; /* 436 */ + u64 alu39[9]; /* 439 */ + u64 alu40[2]; /* 448 */ + u64 alu41[1]; /* 450 */ + u64 alu42[2]; /* 451 */ + u64 alu43[8]; /* 453 */ + u64 alu44[3]; /* 461 */ + u64 alu45[1]; /* 464 */ + u64 alu46[14]; /* 465 */ + u64 alu47[5]; /* 479 */ + u64 alu48[15]; /* 484 */ + u64 alu49[14]; /* 499 */ + u64 alu50[3]; /* 513 */ + u64 alu51[3]; /* 516 */ + u64 alu52[37]; /* 519 */ + u64 alu53[2]; /* 556 */ + u64 alu54[8]; /* 558 */ + u64 alu55[10]; /* 566 */ + u64 tex56[1 * 2]; /* 576 */ + u64 tex57[1 * 2]; /* 578 */ + u64 tex58[1 * 2]; /* 580 */ + u64 tex59[1 * 2]; /* 582 */ + u64 tex60[2 * 2]; /* 584 */ + u64 tex61[2 * 2]; /* 588 */ + u64 tex62[1 * 2]; /* 592 */ + u64 tex63[2 * 2]; /* 594 */ +} VShaderHWNoSkinCode = +{ + { + CALL_FS NO_BARRIER, + ALU_PUSH_BEFORE(128,81) KCACHE0(CB1, _0_15) KCACHE1(CB4, _0_15), + JUMP(0,41), + ALU(209,12) KCACHE0(CB1, _16_31) KCACHE1(CB2, _0_15), + ALU_PUSH_BEFORE(221,1) KCACHE0(CB4, _0_15), + JUMP(1, 8), + ALU(222,8) KCACHE0(CB4, _0_15) KCACHE1(CB1, _16_31), + ALU_POP_AFTER(230,6) KCACHE0(CB2, _0_15), + ALU(236,12) KCACHE0(CB2, _0_15), + LOOP_START_DX10(40), + ALU_BREAK(248,3), + TEX(576,1), + ALU_PUSH_BEFORE(251,5), + JUMP(1, 39), + TEX(578,1), + ALU(256,3), + TEX(580,1), + ALU_PUSH_BEFORE(259,1), + JUMP(0,22), + ALU(260,11), + TEX(582,1), + ALU(271,13), + ELSE(1, 24), + ALU_POP_AFTER(284,1), + ALU_PUSH_BEFORE(285,2), + JUMP(1, 29), + ALU(287,3), + TEX(584,2), + ALU_POP_AFTER(290,20), + ALU(310,13), + TEX(588,2), + ALU_PUSH_BEFORE(323,17) KCACHE0(CB2, _0_15), + JUMP(1, 38), + ALU_PUSH_BEFORE(340,18), + JUMP(2, 38), + ALU(358,3), + TEX(592,1), + ALU_POP2_AFTER(361,9) KCACHE0(CB2, _0_15), + ALU_POP_AFTER(370,1), + LOOP_END(10), + ALU(371,23) 
KCACHE0(CB4, _0_15), + ELSE(1, 48), + ALU_PUSH_BEFORE(394,1) KCACHE0(CB4, _0_15), + JUMP(0,45), + ALU(395,4) KCACHE0(CB1, _16_31), + ELSE(1, 47), + ALU_POP_AFTER(399,4), + ALU_POP_AFTER(403,3), + ALU_PUSH_BEFORE(406,1) KCACHE0(CB4, _0_15), + JUMP(1, 101), + ALU_PUSH_BEFORE(407,2) KCACHE0(CB4, _0_15), + JUMP(0,61), + ALU_PUSH_BEFORE(409,2) KCACHE0(CB4, _0_15), + JUMP(0,59), + ALU_PUSH_BEFORE(411,1) KCACHE0(CB4, _0_15), + JUMP(0,57), + ALU(412,3) KCACHE0(CB1, _16_31), + ELSE(1, 59), + ALU_POP_AFTER(415,3), + ELSE(1, 61), + ALU_POP_AFTER(418,9) KCACHE0(CB1, _16_31) KCACHE1(CB4, _0_15), + ELSE(1, 100), + ALU_PUSH_BEFORE(427,3) KCACHE0(CB4, _0_15), + JUMP(0,73), + ALU_PUSH_BEFORE(430,2) KCACHE0(CB4, _0_15), + JUMP(0,71), + ALU_PUSH_BEFORE(432,1) KCACHE0(CB4, _0_15), + JUMP(0,69), + ALU(433,3) KCACHE0(CB1, _16_31), + ELSE(1, 71), + ALU_POP_AFTER(436,3), + ELSE(1, 73), + ALU_POP_AFTER(439,9) KCACHE0(CB1, _16_31) KCACHE1(CB4, _0_15), + ELSE(0,99), + ALU_PUSH_BEFORE(448,2) KCACHE0(CB4, _0_15), + JUMP(0,93), + ALU_PUSH_BEFORE(450,1) KCACHE0(CB4, _0_15), + JUMP(1, 92), + ALU_PUSH_BEFORE(451,2) KCACHE0(CB4, _0_15), + JUMP(0,81), + ALU(453,8) KCACHE0(CB4, _0_15), + ELSE(0,91), + ALU_PUSH_BEFORE(461,3) KCACHE0(CB4, _0_15), + JUMP(0,89), + ALU_PUSH_BEFORE(464,1) KCACHE0(CB4, _0_15), + JUMP(0,87), + ALU(465,14) KCACHE0(CB4, _0_15), + ELSE(1, 89), + ALU_POP_AFTER(479,5), + ELSE(1, 91), + ALU_POP_AFTER(484,15) KCACHE0(CB4, _0_15), + POP(2, 92), + ALU(499,14) KCACHE0(CB1, _0_31), + ELSE(1, 99), + ALU_PUSH_BEFORE(513,3) KCACHE0(CB4, _0_15), + JUMP(5, 101), + ALU(516,3) KCACHE0(CB4, _0_15), + TEX(594,2), + ALU_POP2_AFTER(519,37) KCACHE0(CB1, _16_31), + POP(2, 100), + POP(1, 101), + ALU_PUSH_BEFORE(556,2) KCACHE0(CB1, _16_31) KCACHE1(CB4, _0_15), + JUMP(1, 104), + ALU_POP_AFTER(558,8) KCACHE0(CB1, _16_31), + ALU(566,2) KCACHE0(CB1, _16_31), + EXP_DONE(POS0, _R9,_x,_y,_z,_w), + EXP(PARAM0, _R5,_x,_y,_z,_w) NO_BARRIER, + EXP(PARAM1, _R8,_x,_y,_z,_w) NO_BARRIER, + EXP(PARAM2, 
_R15,_x,_y,_z,_w) NO_BARRIER, + EXP_DONE(PARAM3, _R0,_x,_y,_y,_y) NO_BARRIER + END_OF_PROGRAM + }, + { + /* 0 */ + ALU_MUL(_R127,_x, KC0(3),_x, ALU_SRC_1,_x), + ALU_MUL(_R127,_y, KC0(3),_z, ALU_SRC_1,_x), + ALU_MUL(_R127,_z, KC0(3),_y, ALU_SRC_1,_x), + ALU_MOV(_R3,_w, ALU_SRC_LITERAL,_x), + ALU_MUL(_R126,_x, KC0(3),_w, ALU_SRC_1,_x) + ALU_LAST, + ALU_LITERAL(0x3F800000), + /* 1 */ + ALU_DOT4(_R6,_x, _R3,_x, KC0(11),_x), + ALU_DOT4(__,_y, _R3,_y, KC0(11),_y), + ALU_DOT4(__,_z, _R3,_z, KC0(11),_z), + ALU_DOT4(__,_w, ALU_SRC_PV,_w, KC0(11),_w), + ALU_MOV(_R7,_x, ALU_SRC_0,_x) + ALU_LAST, + /* 2 */ + ALU_DOT4(__,_x, _R3,_x, KC0(12),_x), + ALU_DOT4(_R5,_y, _R3,_y, KC0(12),_y), + ALU_DOT4(__,_z, _R3,_z, KC0(12),_z), + ALU_DOT4(__,_w, _R3,_w, KC0(12),_w), + ALU_MOV(_R6,_y, ALU_SRC_0,_x) + ALU_LAST, + /* 3 */ + ALU_DOT4(__,_x, _R3,_x, KC0(13),_x), + ALU_DOT4(__,_y, _R3,_y, KC0(13),_y), + ALU_DOT4(_R4,_z, _R3,_z, KC0(13),_z), + ALU_DOT4(__,_w, _R3,_w, KC0(13),_w), + ALU_MOV(_R0,_w, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x3F800000), + /* 4 */ + ALU_DOT4(__,_x, _R6,_x, KC0(10),_x), + ALU_DOT4(_R7,_y, _R5,_y, KC0(10),_y), + ALU_DOT4(__,_z, ALU_SRC_PV,_x, KC0(10),_z), + ALU_DOT4(__,_w, ALU_SRC_PS,_x, KC0(10),_w), + ALU_MOV(_R6,_z, ALU_SRC_0,_x) + ALU_LAST, + /* 5 */ + ALU_DOT4(_R125,_x, _R6,_x, KC0(9),_x), + ALU_DOT4(__,_y, _R5,_y, KC0(9),_y), + ALU_DOT4(__,_z, _R4,_z, KC0(9),_z), + ALU_DOT4(__,_w, _R0,_w, KC0(9),_w), + ALU_MOV(_R2,_w, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x3F800000), + /* 6 */ + ALU_MULADD(_R127,_x, _R7,_y, KC0(2),_w, _R126,_x), + ALU_MULADD(_R127,_y, _R7,_y, KC0(2),_z, _R127,_y), + ALU_MULADD(_R127,_z, _R7,_y, KC0(2),_y, _R127,_z), + ALU_MULADD(_R127,_w, _R7,_y, KC0(2),_x, _R127,_x) VEC_021 + ALU_LAST, + /* 7 */ + ALU_DOT4(__,_x, _R6,_x, KC0(8),_x), + ALU_DOT4(__,_y, _R5,_y, KC0(8),_y), + ALU_DOT4(_R126,_z, _R4,_z, KC0(8),_z), + ALU_DOT4(__,_w, _R0,_w, KC0(8),_w) + ALU_LAST, + /* 8 */ + ALU_MULADD(_R123,_x, _R125,_x, KC0(1),_w, _R127,_x), + 
ALU_MULADD(_R123,_y, _R125,_x, KC0(1),_z, _R127,_y), + ALU_MULADD(_R123,_z, _R125,_x, KC0(1),_y, _R127,_z), + ALU_MULADD(_R123,_w, _R125,_x, KC0(1),_x, _R127,_w) + ALU_LAST, + /* 9 */ + ALU_MULADD(_R9,_x, _R126,_z, KC0(0),_x, ALU_SRC_PV,_w), + ALU_MULADD(_R9,_y, _R126,_z, KC0(0),_y, ALU_SRC_PV,_z), + ALU_MULADD(_R9,_z, _R126,_z, KC0(0),_z, ALU_SRC_PV,_y), + ALU_MULADD(_R9,_w, _R126,_z, KC0(0),_w, ALU_SRC_PV,_x) + ALU_LAST, + /* 10 */ + ALU_CNDE_INT(_R123,_x, KC1(5),_w, ALU_SRC_0,_x, _R2,_x), + ALU_CNDE_INT(_R123,_y, KC1(5),_w, ALU_SRC_LITERAL,_x, _R2,_z), + ALU_CNDE_INT(_R123,_w, KC1(5),_w, ALU_SRC_0,_x, _R2,_y) + ALU_LAST, + ALU_LITERAL(0x3F800000), + /* 11 */ + ALU_CNDE_INT(_R127,_x, KC1(6),_x, ALU_SRC_PV,_x, ALU_SRC_PV _NEG,_x), + ALU_CNDE_INT(_R127,_y, KC1(6),_x, ALU_SRC_PV,_w, ALU_SRC_PV _NEG,_w), + ALU_CNDE_INT(_R126,_z, KC1(6),_x, ALU_SRC_PV,_y, ALU_SRC_PV _NEG,_y) + ALU_LAST, + /* 12 */ + ALU_DOT4(_R125,_x, ALU_SRC_PV,_x, KC0(11),_x), + ALU_DOT4(__,_y, ALU_SRC_PV,_y, KC0(11),_y), + ALU_DOT4(__,_z, ALU_SRC_PV,_z, KC0(11),_z), + ALU_DOT4(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x) + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 13 */ + ALU_DOT4(__,_x, _R127,_x, KC0(12),_x), + ALU_DOT4(_R126,_y, _R127,_y, KC0(12),_y), + ALU_DOT4(__,_z, _R126,_z, KC0(12),_z), + ALU_DOT4(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x) + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 14 */ + ALU_DOT4(__,_x, _R127,_x, KC0(13),_x), + ALU_DOT4(__,_y, _R127,_y, KC0(13),_y), + ALU_DOT4(_R126,_z, _R126,_z, KC0(13),_z), + ALU_DOT4(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x) + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 15 */ + ALU_DOT4_IEEE(__,_x, _R125,_x, _R125,_x), + ALU_DOT4_IEEE(__,_y, _R126,_y, _R126,_y), + ALU_DOT4_IEEE(__,_z, ALU_SRC_PV,_x, ALU_SRC_PV,_x), + ALU_DOT4_IEEE(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x) + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 16 */ + ALU_RECIPSQRT_IEEE(__,_x, ALU_SRC_PV,_x) SCL_210 + ALU_LAST, + /* 17 */ + ALU_MUL(_R17,_x, _R125,_x, ALU_SRC_PS,_x), + ALU_MUL(_R17,_y, 
_R126,_y, ALU_SRC_PS,_x), + ALU_MUL(_R7,_z, _R126,_z, ALU_SRC_PS,_x) + ALU_LAST, + /* 18 */ + ALU_PRED_SETNE_INT(__,_x, KC1(9),_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 19 */ + ALU_MOV(_R10,_x, KC0(4),_x), + ALU_MOV(_R10,_y, KC0(4),_y), + ALU_MOV(_R10,_z, KC0(4),_z), + ALU_MOV(_R10,_w, KC0(4),_w) + ALU_LAST, + /* 20 */ + ALU_MOV(_R11,_x, KC1(1),_x), + ALU_MOV(_R11,_y, KC1(1),_y), + ALU_MOV(_R11,_z, KC1(1),_z), + ALU_MOV(_R0,_w, KC1(1),_w) + ALU_LAST, + /* 21 */ + ALU_MOV(_R12,_x, KC1(2),_x), + ALU_MOV(_R12,_y, KC1(2),_y), + ALU_MOV(_R12,_z, KC1(2),_z), + ALU_MOV(_R0,_w, KC1(2),_w) + ALU_LAST, + }, + { + /* 22 */ + ALU_PRED_SETNE_INT(__,_x, KC0(4),_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 23 */ + ALU_AND_INT(_R0,_x, KC0(8),_w, ALU_SRC_LITERAL,_x), + ALU_AND_INT(__,_y, KC0(8),_w, ALU_SRC_1_INT,_x), + ALU_AND_INT(_R0,_w, KC0(8),_w, ALU_SRC_LITERAL,_y) + ALU_LAST, + ALU_LITERAL2(0x00000002, 0x00000004), + /* 24 */ + ALU_CNDE_INT(_R10,_x, ALU_SRC_PV,_y, KC1(4),_x, _R1,_x), + ALU_CNDE_INT(_R10,_y, ALU_SRC_PV,_y, KC1(4),_y, _R1,_y), + ALU_CNDE_INT(_R10,_z, ALU_SRC_PV,_y, KC1(4),_z, _R1,_z), + ALU_CNDE_INT(_R10,_w, ALU_SRC_PV,_y, KC1(4),_w, _R1,_w) + ALU_LAST, + }, + { + /* 25 */ + ALU_CNDE_INT(_R11,_x, _R0,_x, KC0(1),_x, _R1,_x), + ALU_CNDE_INT(_R11,_y, _R0,_x, KC0(1),_y, _R1,_y), + ALU_CNDE_INT(_R11,_z, _R0,_x, KC0(1),_z, _R1,_z) + ALU_LAST, + /* 26 */ + ALU_CNDE_INT(_R12,_x, _R0,_w, KC0(2),_x, _R1,_x), + ALU_CNDE_INT(_R12,_y, _R0,_w, KC0(2),_y, _R1,_y), + ALU_CNDE_INT(_R12,_z, _R0,_w, KC0(2),_z, _R1,_z) + ALU_LAST, + }, + { + /* 27 */ + ALU_MOV(__,_x, ALU_SRC_0,_x), + ALU_MOV(__,_y, KC0(3),_z), + ALU_MOV(__,_z, KC0(3),_y), + ALU_MOV(__,_w, KC0(3),_x), + ALU_MOV(_R14,_x, ALU_SRC_0,_x) + ALU_LAST, + /* 28 */ + ALU_MULADD(_R13,_x, _R10,_x, KC0(0),_x, ALU_SRC_PV,_w), + ALU_MULADD(_R13,_y, _R10,_y, KC0(0),_y, ALU_SRC_PV,_z), + ALU_MULADD(_R13,_z, _R10,_z, KC0(0),_z, ALU_SRC_PV,_y), + 
ALU_MULADD(_R13,_w, _R10,_w, KC0(0),_w, ALU_SRC_PV,_x), + ALU_MOV(_R14,_y, ALU_SRC_0,_x) + ALU_LAST, + /* 29 */ + ALU_MOV(_R14,_z, ALU_SRC_0,_x), + ALU_MOV(_R4,_w, ALU_SRC_0,_x) + ALU_LAST, + }, + { + /* 30 */ + ALU_SETGT_INT(_R0,_x, ALU_SRC_LITERAL,_x, _R4,_w) + ALU_LAST, + ALU_LITERAL(0x00000004), + /* 31 */ + ALU_PRED_SETNE_INT(__,_x, _R0,_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 32 */ + ALU_ADD_INT(__,_x, _R4,_w, ALU_SRC_1_INT,_x), + ALU_CNDE_INT(_R0,_y, _R0,_z, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x) + ALU_LAST, + ALU_LITERAL(0x3F800000), + /* 33 */ + ALU_CNDE_INT(_R4,_w, _R0,_z, ALU_SRC_PV,_x, _R4,_w) + ALU_LAST, + /* 34 */ + ALU_PRED_SETE_INT(__,_x, _R0,_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 35 */ + ALU_ADD_INT(_R0,_z, _R4,_w, ALU_SRC_LITERAL,_x), + ALU_SETE_INT(_R0,_w, _R16,_y, ALU_SRC_0,_x) + ALU_LAST, + ALU_LITERAL(0x00000004), + }, + { + /* 36 */ + ALU_PRED_SETE_INT(__,_x, _R0,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 37 */ + ALU_ADD(_R5,_x, _R6 _NEG,_x, _R8,_x), + ALU_ADD_INT(_R0,_y, _R4,_w, ALU_SRC_LITERAL,_x), + ALU_ADD(_R0,_z, _R5 _NEG,_y, _R8,_y), + ALU_ADD(_R1,_w, _R4 _NEG,_z, _R8,_z), + ALU_MOV(_R0,_x, ALU_SRC_LITERAL,_y) + ALU_LAST, + ALU_LITERAL2(0x0000000C, 0x3F800000), + /* 38 */ + ALU_DOT4(__,_x, ALU_SRC_PV,_x, ALU_SRC_PV,_x), + ALU_DOT4(__,_y, ALU_SRC_PV,_z, ALU_SRC_PV,_z), + ALU_DOT4(__,_z, ALU_SRC_PV,_w, ALU_SRC_PV,_w), + ALU_DOT4(_R0,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x) + ALU_LAST, + ALU_LITERAL(0x80000000), + }, + { + /* 39 */ + ALU_SQRT_IEEE(__,_x, _R0,_w) SCL_210 + ALU_LAST, + /* 40 */ + ALU_MOV(__,_y, ALU_SRC_PS,_x), + ALU_MUL(__,_z, ALU_SRC_PS,_x, ALU_SRC_PS,_x), + ALU_RECIP_IEEE(_R127,_w, ALU_SRC_PS,_x) SCL_210 + ALU_LAST, + /* 41 */ + ALU_DOT4(__,_x, _R0,_x, _R1,_x), + ALU_DOT4(__,_y, ALU_SRC_PV,_y, _R1,_y), + ALU_DOT4(__,_z, ALU_SRC_PV,_z, _R1,_z), + ALU_DOT4(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x), + 
ALU_MUL_IEEE(_R8,_x, _R5,_x, ALU_SRC_PS,_x) + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 42 */ + ALU_MUL_IEEE(_R8,_y, _R0,_z, _R127,_w), + ALU_MUL_IEEE(_R8,_z, _R1,_w, _R127,_w), + ALU_RECIP_IEEE(_R1,_w, ALU_SRC_PV,_x) CLAMP SCL_210 + ALU_LAST, + }, + { + /* 43 */ + ALU_MOV(_R1,_w, _R2,_w) + ALU_LAST, + }, + { + /* 44 */ + ALU_PRED_SETGE_INT(__,_x, _R16,_y, ALU_SRC_LITERAL,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + ALU_LITERAL(0x00000002), + }, + { + /* 45 */ + ALU_ADD_INT(_R0,_z, _R4,_w, ALU_SRC_LITERAL,_x), + ALU_ADD_INT(_R0,_w, _R4,_w, ALU_SRC_LITERAL,_y) + ALU_LAST, + ALU_LITERAL2(0x00000010, 0x00000008), + }, + { + /* 46 */ + ALU_DOT4_IEEE(__,_x, _R1,_x, _R1,_x), + ALU_DOT4_IEEE(__,_y, _R1,_y, _R1,_y), + ALU_DOT4_IEEE(__,_z, _R1,_z, _R1,_z), + ALU_DOT4_IEEE(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x) + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 47 */ + ALU_RECIPSQRT_IEEE(__,_x, ALU_SRC_PV,_x) SCL_210 + ALU_LAST, + /* 48 */ + ALU_MUL(__,_x, _R1,_x, ALU_SRC_PS,_x), + ALU_MUL(__,_y, _R1,_y, ALU_SRC_PS,_x), + ALU_MUL(__,_z, _R1,_z, ALU_SRC_PS,_x) + ALU_LAST, + /* 49 */ + ALU_DOT4(__,_x, _R8,_x, ALU_SRC_PV,_x), + ALU_DOT4(__,_y, _R8,_y, ALU_SRC_PV,_y), + ALU_DOT4(__,_z, _R8,_z, ALU_SRC_PV,_z), + ALU_DOT4(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x) + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 50 */ + ALU_SETGE_DX10(_R127,_x, ALU_SRC_PV,_x, _R0,_x), + ALU_LOG_CLAMPED(__,_x, ALU_SRC_PV,_x) SCL_210 + ALU_LAST, + /* 51 */ + ALU_MUL(__,_z, _R0,_y, ALU_SRC_PS,_x) + ALU_LAST, + /* 52 */ + ALU_EXP_IEEE(__,_x, ALU_SRC_PV,_z) SCL_210 + ALU_LAST, + /* 53 */ + ALU_MUL(__,_x, _R1,_w, ALU_SRC_PS,_x) + ALU_LAST, + /* 54 */ + ALU_CNDE_INT(_R1,_w, _R127,_x, ALU_SRC_0,_x, ALU_SRC_PV,_x) + ALU_LAST, + }, + { + /* 55 */ + ALU_SETE_INT(_R5,_x, _R16,_x, ALU_SRC_LITERAL,_x), + ALU_ADD_INT(_R0,_y, _R4,_w, ALU_SRC_LITERAL,_y), + ALU_ADD_INT(_R0,_w, _R4,_w, ALU_SRC_LITERAL,_z), + ALU_MUL(__,_x, _R7,_z, _R8,_z) + ALU_LAST, + ALU_LITERAL3(0x00000002, 0x00000014, 0x00000018), + /* 56 */ 
+ ALU_DOT4(__,_x, _R17,_x, _R8,_x), + ALU_DOT4(__,_y, _R17,_y, _R8,_y), + ALU_DOT4(__,_z, ALU_SRC_PS,_x, ALU_SRC_1,_x), + ALU_DOT4(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x) + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 57 */ + ALU_MAX(_R5,_z, ALU_SRC_PV,_x, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x33D6BF95), + }, + { + /* 58 */ + ALU_MUL(_R127,_x, _R11,_x, _R1,_x), + ALU_MUL(_R127,_z, _R11,_z, _R1,_z), + ALU_MUL(_R127,_w, _R11,_y, _R1,_y), + ALU_LOG_CLAMPED(__,_x, _R5,_z) SCL_210 + ALU_LAST, + /* 59 */ + ALU_MUL(_R126,_x, _R10,_x, _R0,_x), + ALU_MUL(_R127,_y, _R10,_y, _R0,_y), + ALU_MUL(_R126,_z, _R10,_z, _R0,_z), + ALU_MUL(__,_w, KC0(2),_w, ALU_SRC_PS,_x) + ALU_LAST, + /* 60 */ + ALU_EXP_IEEE(__,_x, ALU_SRC_PV,_w) SCL_210 + ALU_LAST, + /* 61 */ + ALU_CNDE_INT(_R123,_y, _R5,_x, _R5,_z, ALU_SRC_PS,_x) + ALU_LAST, + /* 62 */ + ALU_MULADD(_R123,_x, ALU_SRC_PV,_y, _R127,_w, _R127,_y), + ALU_MULADD(_R123,_y, ALU_SRC_PV,_y, _R127,_x, _R126,_x), + ALU_MULADD(_R123,_w, ALU_SRC_PV,_y, _R127,_z, _R126,_z) + ALU_LAST, + /* 63 */ + ALU_MULADD(_R13,_x, _R1,_w, ALU_SRC_PV,_y, _R13,_x), + ALU_MULADD(_R13,_y, _R1,_w, ALU_SRC_PV,_x, _R13,_y), + ALU_MULADD(_R13,_z, _R1,_w, ALU_SRC_PV,_w, _R13,_z) + ALU_LAST, + /* 64 */ + ALU_PRED_SETNE_INT(__,_x, ALU_SRC_0,_x, _R16,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 65 */ + ALU_ADD(_R127,_x, _R8,_x, ALU_SRC_0,_x), + ALU_ADD(_R127,_y, _R8,_y, ALU_SRC_0,_x), + ALU_ADD(_R127,_z, _R8,_z, ALU_SRC_1,_x) + ALU_LAST, + /* 66 */ + ALU_DOT4_IEEE(__,_x, ALU_SRC_PV,_x, ALU_SRC_PV,_x), + ALU_DOT4_IEEE(__,_y, ALU_SRC_PV,_y, ALU_SRC_PV,_y), + ALU_DOT4_IEEE(__,_z, ALU_SRC_PV,_z, ALU_SRC_PV,_z), + ALU_DOT4_IEEE(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x) + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 67 */ + ALU_RECIPSQRT_IEEE(__,_x, ALU_SRC_PV,_x) SCL_210 + ALU_LAST, + /* 68 */ + ALU_MUL(__,_x, _R127,_x, ALU_SRC_PS,_x), + ALU_MUL(__,_y, _R127,_y, ALU_SRC_PS,_x), + ALU_MUL(__,_z, _R127,_z, ALU_SRC_PS,_x) + ALU_LAST, + /* 69 */ + 
ALU_DOT4(_R0,_x, _R17,_x, ALU_SRC_PV,_x), + ALU_DOT4(__,_y, _R17,_y, ALU_SRC_PV,_y), + ALU_DOT4(__,_z, _R7,_z, ALU_SRC_PV,_z), + ALU_DOT4(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x) + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 70 */ + ALU_PRED_SETGT(__,_x, _R0,_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 71 */ + ALU_ADD_INT(_R0,_z, _R4,_w, ALU_SRC_LITERAL,_x), + ALU_LOG_CLAMPED(_R1,_z, _R0,_x) SCL_210 + ALU_LAST, + ALU_LITERAL(0x0000001C), + }, + { + /* 72 */ + ALU_MUL(_R127,_x, _R12,_x, _R0,_x), + ALU_MUL(__,_y, KC0(2),_w, _R1,_z), + ALU_MUL(_R127,_z, _R12,_y, _R0,_y), + ALU_MUL(_R127,_w, _R12,_z, _R0,_z) VEC_021 + ALU_LAST, + /* 73 */ + ALU_EXP_IEEE(__,_x, ALU_SRC_PV,_y) SCL_210 + ALU_LAST, + /* 74 */ + ALU_MUL(__,_w, _R1,_w, ALU_SRC_PS,_x) + ALU_LAST, + /* 75 */ + ALU_MULADD(_R14,_x, ALU_SRC_PV,_w, _R127,_x, _R14,_x), + ALU_MULADD(_R14,_y, ALU_SRC_PV,_w, _R127,_z, _R14,_y), + ALU_MULADD(_R14,_z, ALU_SRC_PV,_w, _R127,_w, _R14,_z) + ALU_LAST, + }, + { + /* 76 */ + ALU_ADD_INT(_R4,_w, _R4,_w, ALU_SRC_1_INT,_x) + ALU_LAST, + }, + { + /* 77 */ + ALU_MOV(_R127,_x, _R13,_z) CLAMP, + ALU_MOV(_R127,_y, _R13,_y) CLAMP, + ALU_MOV(_R127,_z, _R13,_x) CLAMP, + ALU_MOV(_R0,_w, ALU_SRC_0,_x), + ALU_MOV(__,_x, _R13,_w) CLAMP + ALU_LAST, + /* 78 */ + ALU_MOV(_R126,_x, _R13,_z) CLAMP, + ALU_MOV(_R126,_y, _R13,_y) CLAMP, + ALU_MOV(_R126,_z, _R13,_x) CLAMP, + ALU_ADD(_R127,_w, ALU_SRC_PV,_w, ALU_SRC_PS,_x) CLAMP, + ALU_MOV(_R126,_w, _R13,_w) CLAMP + ALU_LAST, + /* 79 */ + ALU_MOV(_R125,_x, _R14,_z) CLAMP, + ALU_MOV(_R127,_y, _R14,_y) CLAMP, + ALU_MOV(_R127,_z, _R14,_x) CLAMP, + ALU_ADD(__,_w, _R14,_x, _R127,_z) CLAMP, + ALU_ADD(__,_x, _R14,_y, _R127,_y) CLAMP + ALU_LAST, + /* 80 */ + ALU_ADD(__,_x, _R14,_z, _R127,_x) CLAMP, + ALU_CNDE_INT(_R5,_y, KC0(4),_x, ALU_SRC_PS,_x, _R126,_y), + ALU_CNDE_INT(_R5,_x, KC0(4),_x, ALU_SRC_PV,_w, _R126,_z) VEC_021 + ALU_LAST, + /* 81 */ + ALU_CNDE_INT(_R8,_x, KC0(4),_x, ALU_SRC_0,_x, _R127,_z), + 
ALU_CNDE_INT(_R8,_y, KC0(4),_x, ALU_SRC_0,_x, _R127,_y), + ALU_CNDE_INT(_R5,_z, KC0(4),_x, ALU_SRC_PV,_x, _R126,_x) VEC_021, + ALU_CNDE_INT(_R5,_w, KC0(4),_x, _R127,_w, _R126,_w), + ALU_CNDE_INT(_R8,_z, KC0(4),_x, ALU_SRC_0,_x, _R125,_x) VEC_021 + ALU_LAST, + }, + { + /* 82 */ + ALU_PRED_SETE_INT(__,_x, KC0(4),_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 83 */ + ALU_MOV(_R5,_x, KC0(4),_x), + ALU_MOV(_R5,_y, KC0(4),_y), + ALU_MOV(_R5,_z, KC0(4),_z), + ALU_MOV(_R5,_w, KC0(4),_w) + ALU_LAST, + }, + { + /* 84 */ + ALU_MOV(_R5,_x, _R1,_x), + ALU_MOV(_R5,_y, _R1,_y), + ALU_MOV(_R5,_z, _R1,_z), + ALU_MOV(_R5,_w, _R1,_w) + ALU_LAST, + }, + { + /* 85 */ + ALU_MOV(_R8,_x, _R7,_x), + ALU_MOV(_R8,_y, _R6,_y), + ALU_MOV(_R8,_z, _R6,_z) + ALU_LAST, + }, + { + /* 86 */ + ALU_PRED_SETNE_INT(__,_x, KC0(5),_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 87 */ + ALU_SETNE_INT(_R0,_x, KC0(7),_y, ALU_SRC_0,_x) + ALU_LAST, + /* 88 */ + ALU_PRED_SETE_INT(__,_x, _R0,_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 89 */ + ALU_NOT_INT(_R0,_w, KC0(4),_y) + ALU_LAST, + /* 90 */ + ALU_PRED_SETNE_INT(__,_x, _R0,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 91 */ + ALU_PRED_SETNE_INT(__,_x, KC0(6),_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 92 */ + ALU_MUL(_R15,_x, _R4,_x, KC0(1),_x), + ALU_MUL(_R15,_y, _R4,_y, KC0(1),_y), + ALU_MOV(_R15,_z, ALU_SRC_0,_x) + ALU_LAST, + }, + { + /* 93 */ + ALU_MOV(_R15,_x, _R7,_x), + ALU_MOV(_R15,_y, _R6,_y), + ALU_MOV(_R15,_z, _R6,_z) + ALU_LAST, + }, + { + /* 94 */ + ALU_MULADD(_R127,_x, _R4,_x, KC0(1),_x, KC0(1),_z), + ALU_MOV(_R127,_y, ALU_SRC_0,_x), + ALU_MOV(_R127,_z, ALU_SRC_0,_x), + ALU_MULADD(_R127,_w, _R4,_y, KC0(1),_y, KC0(1),_w) + ALU_LAST, + /* 95 */ + ALU_MOV(__,_x, KC0(1),_w), + ALU_MOV(__,_y, KC0(1),_z) + ALU_LAST, + /* 96 */ + ALU_CNDE_INT(_R15,_x, 
KC1(6),_y, ALU_SRC_PV,_y, _R127,_x), + ALU_CNDE_INT(_R15,_y, KC1(6),_y, ALU_SRC_PV,_x, _R127,_w), + ALU_CNDE_INT(_R15,_z, KC1(6),_y, _R127,_y, _R127,_z) + ALU_LAST, + }, + { + /* 97 */ + ALU_SETNE_INT(_R0,_z, KC0(7),_y, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000003), + /* 98 */ + ALU_PRED_SETE_INT(__,_x, _R0,_z, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 99 */ + ALU_NOT_INT(_R0,_y, KC0(4),_y) + ALU_LAST, + /* 100 */ + ALU_PRED_SETNE_INT(__,_x, _R0,_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 101 */ + ALU_PRED_SETNE_INT(__,_x, KC0(6),_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 102 */ + ALU_MUL(_R15,_x, _R4,_x, KC0(1),_x), + ALU_MUL(_R15,_y, _R4,_y, KC0(1),_y), + ALU_MOV(_R15,_z, ALU_SRC_0,_x) + ALU_LAST, + }, + { + /* 103 */ + ALU_MOV(_R15,_x, _R7,_x), + ALU_MOV(_R15,_y, _R6,_y), + ALU_MOV(_R15,_z, _R6,_z) + ALU_LAST, + }, + { + /* 104 */ + ALU_MULADD(_R127,_x, _R4,_x, KC0(1),_x, KC0(1),_z), + ALU_MOV(_R127,_y, ALU_SRC_0,_x), + ALU_MOV(_R127,_z, ALU_SRC_0,_x), + ALU_MULADD(_R127,_w, _R4,_y, KC0(1),_y, KC0(1),_w) + ALU_LAST, + /* 105 */ + ALU_MOV(__,_x, KC0(1),_w), + ALU_MOV(__,_y, KC0(1),_z) + ALU_LAST, + /* 106 */ + ALU_CNDE_INT(_R15,_x, KC1(6),_y, ALU_SRC_PV,_y, _R127,_x), + ALU_CNDE_INT(_R15,_y, KC1(6),_y, ALU_SRC_PV,_x, _R127,_w), + ALU_CNDE_INT(_R15,_z, KC1(6),_y, _R127,_y, _R127,_z) + ALU_LAST, + }, + { + /* 107 */ + ALU_SETNE_INT(_R0,_x, KC0(7),_y, ALU_SRC_1_INT,_x) + ALU_LAST, + /* 108 */ + ALU_PRED_SETE_INT(__,_x, _R0,_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 109 */ + ALU_PRED_SETNE_INT(__,_x, KC0(7),_z, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 110 */ + ALU_SETNE_INT(_R0,_w, KC0(7),_z, ALU_SRC_1_INT,_x) + ALU_LAST, + /* 111 */ + ALU_PRED_SETE_INT(__,_x, _R0,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 112 */ 
+ ALU_CNDE_INT(_R3,_x, KC0(6),_y, ALU_SRC_0,_x, _R4,_x), + ALU_CNDE_INT(_R3,_y, KC0(6),_y, ALU_SRC_0,_x, _R4,_y), + ALU_MOV(_R0,_z, ALU_SRC_0,_x), + ALU_MOV(_R0,_w, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x3F800000), + /* 113 */ + ALU_CNDE_INT(_R3,_z, KC0(6),_y, ALU_SRC_0,_x, ALU_SRC_PV,_z), + ALU_CNDE_INT(_R3,_w, KC0(6),_y, ALU_SRC_LITERAL,_x, ALU_SRC_PV,_w) + ALU_LAST, + ALU_LITERAL(0x3F800000), + }, + { + /* 114 */ + ALU_SETNE_INT(_R0,_z, KC0(7),_z, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000002), + /* 115 */ + ALU_PRED_SETE_INT(__,_x, _R0,_z, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 116 */ + ALU_PRED_SETNE_INT(__,_x, KC0(5),_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 117 */ + ALU_CNDE_INT(_R127,_x, KC0(6),_x, _R2,_x, _R2 _NEG,_x), + ALU_CNDE_INT(_R127,_y, KC0(6),_x, _R2,_y, _R2 _NEG,_y), + ALU_CNDE_INT(_R127,_z, KC0(6),_x, _R2,_z, _R2 _NEG,_z) + ALU_LAST, + /* 118 */ + ALU_DOT4_IEEE(__,_x, ALU_SRC_PV,_x, ALU_SRC_PV,_x), + ALU_DOT4_IEEE(__,_y, ALU_SRC_PV,_y, ALU_SRC_PV,_y), + ALU_DOT4_IEEE(__,_z, ALU_SRC_PV,_z, ALU_SRC_PV,_z), + ALU_DOT4_IEEE(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x) + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 119 */ + ALU_RECIPSQRT_IEEE(__,_x, ALU_SRC_PV,_x) SCL_210 + ALU_LAST, + /* 120 */ + ALU_MUL(_R3,_x, _R127,_x, ALU_SRC_PS,_x), + ALU_MUL(_R3,_y, _R127,_y, ALU_SRC_PS,_x), + ALU_MUL(_R3,_z, _R127,_z, ALU_SRC_PS,_x), + ALU_MOV(_R3,_w, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x3F800000), + }, + { + /* 121 */ + ALU_MOV(_R3,_x, ALU_SRC_0,_x), + ALU_MOV(_R3,_y, ALU_SRC_0,_x), + ALU_MOV(_R3,_z, ALU_SRC_LITERAL,_x), + ALU_MOV(_R3,_w, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x3F800000), + }, + { + /* 122 */ + ALU_MOV(__,_x, ALU_SRC_LITERAL,_x), + ALU_CNDE_INT(_R123,_y, KC0(6),_x, _R2,_z, _R2 _NEG,_z), + ALU_CNDE_INT(_R123,_z, KC0(6),_x, _R2,_y, _R2 _NEG,_y), + ALU_CNDE_INT(_R123,_w, KC0(6),_x, _R2,_x, _R2 _NEG,_x), + ALU_SETNE_INT(_R127,_x, 
KC0(7),_z, ALU_SRC_LITERAL,_y) + ALU_LAST, + ALU_LITERAL2(0x3F800000, 0x00000003), + /* 123 */ + ALU_CNDE_INT(_R123,_x, KC0(5),_w, ALU_SRC_LITERAL,_x, ALU_SRC_PV,_x), + ALU_CNDE_INT(_R123,_y, KC0(5),_w, ALU_SRC_LITERAL,_x, ALU_SRC_PV,_y), + ALU_CNDE_INT(_R123,_z, KC0(5),_w, ALU_SRC_0,_x, ALU_SRC_PV,_z), + ALU_CNDE_INT(_R123,_w, KC0(5),_w, ALU_SRC_0,_x, ALU_SRC_PV,_w) + ALU_LAST, + ALU_LITERAL(0x3F800000), + /* 124 */ + ALU_CNDE_INT(_R3,_x, _R127,_x, ALU_SRC_PV,_w, ALU_SRC_0,_x), + ALU_CNDE_INT(_R3,_y, _R127,_x, ALU_SRC_PV,_z, ALU_SRC_0,_x), + ALU_CNDE_INT(_R3,_z, _R127,_x, ALU_SRC_PV,_y, ALU_SRC_0,_x), + ALU_CNDE_INT(_R3,_w, _R127,_x, ALU_SRC_PV,_x, ALU_SRC_0,_x) + ALU_LAST, + }, + { + /* 125 */ + ALU_DOT4(_R127,_x, _R3,_x, KC0(14),_x), + ALU_DOT4(__,_y, _R3,_y, KC0(14),_y), + ALU_DOT4(__,_z, _R3,_z, KC0(14),_z), + ALU_DOT4(__,_w, _R3,_w, KC0(14),_w) + ALU_LAST, + /* 126 */ + ALU_DOT4(__,_x, _R3,_x, KC0(15),_x), + ALU_DOT4(__,_y, _R3,_y, KC0(15),_y), + ALU_DOT4(__,_z, _R3,_z, KC0(15),_z), + ALU_DOT4(_R127,_w, _R3,_w, KC0(15),_w) + ALU_LAST, + /* 127 */ + ALU_DOT4(__,_x, _R3,_x, KC0(16),_x), + ALU_DOT4(__,_y, _R3,_y, KC0(16),_y), + ALU_DOT4(_R15,_z, _R3,_z, KC0(16),_z), + ALU_DOT4(__,_w, _R3,_w, KC0(16),_w) + ALU_LAST, + /* 128 */ + ALU_MUL(_R15,_x, KC0(17),_x, _R127,_x), + ALU_MUL(_R15,_y, KC0(17),_y, _R127,_w) + ALU_LAST, + }, + { + /* 129 */ + ALU_SETNE_INT(_R0,_z, KC0(7),_y, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000002), + /* 130 */ + ALU_PRED_SETE_INT(__,_x, _R0,_z, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 131 */ + ALU_ADD_INT(_R0,_x, KC0(8),_x, ALU_SRC_LITERAL,_x), + ALU_ADD_INT(_R0,_w, KC0(7),_w, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000004), + }, + { + /* 132 */ + ALU_DOT4_IEEE(__,_x, _R1,_x, _R1,_x), + ALU_DOT4_IEEE(__,_y, _R1,_y, _R1,_y), + ALU_DOT4_IEEE(__,_z, _R1,_z, _R1,_z), + ALU_DOT4_IEEE(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x), + ALU_MUL_IEEE(__,_x, _R0,_z, _R0,_z) + ALU_LAST, + 
ALU_LITERAL(0x80000000), + /* 133 */ + ALU_DOT4_IEEE(__,_x, _R0,_x, _R0,_x), + ALU_DOT4_IEEE(__,_y, _R0,_y, _R0,_y), + ALU_DOT4_IEEE(__,_z, ALU_SRC_PS,_x, ALU_SRC_1,_x), + ALU_DOT4_IEEE(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x), + ALU_RECIPSQRT_IEEE(__,_x, ALU_SRC_PV,_x) SCL_210 + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 134 */ + ALU_MUL(_R127,_x, _R1,_x, ALU_SRC_PS,_x), + ALU_MUL(_R127,_y, _R1,_y, ALU_SRC_PS,_x), + ALU_MUL(__,_z, _R1,_z, ALU_SRC_PS,_x), + ALU_RECIPSQRT_IEEE(__,_x, ALU_SRC_PV,_x) SCL_210 + ALU_LAST, + /* 135 */ + ALU_MUL(_R126,_x, _R0,_x, ALU_SRC_PS,_x), + ALU_MUL(_R126,_y, _R0,_y, ALU_SRC_PS,_x), + ALU_MUL(__,_z, _R0,_z, ALU_SRC_PS,_x), + ALU_MUL(__,_x, _R7,_z, ALU_SRC_PV,_z) + ALU_LAST, + /* 136 */ + ALU_DOT4(__,_x, _R17,_x, _R127,_x), + ALU_DOT4(__,_y, _R17,_y, _R127,_y), + ALU_DOT4(__,_z, ALU_SRC_PS,_x, ALU_SRC_1,_x), + ALU_DOT4(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x), + ALU_MUL(__,_x, _R7,_z, ALU_SRC_PV,_z) + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 137 */ + ALU_DOT4(__,_x, _R17,_x, _R126,_x), + ALU_DOT4(__,_y, _R17,_y, _R126,_y), + ALU_DOT4(__,_z, ALU_SRC_PS,_x, ALU_SRC_1,_x), + ALU_DOT4(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x), + ALU_ADD_D2(__,_x, ALU_SRC_PV,_x, ALU_SRC_1,_x) + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 138 */ + ALU_MUL(_R15,_x, ALU_SRC_PS,_x, KC0(1),_x), + ALU_ADD_D2(__,_w, ALU_SRC_PV,_x, ALU_SRC_1,_x) + ALU_LAST, + /* 139 */ + ALU_MUL(_R15,_y, ALU_SRC_PV,_w, KC0(1),_y), + ALU_MOV(_R15,_z, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x3F800000), + }, + { + /* 140 */ + ALU_ADD(_R0,_y, _R7,_y, KC0(3),_x) + ALU_LAST, + /* 141 */ + ALU_PRED_SETNE_INT(__,_x, KC1(10),_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 142 */ + ALU_RECIP_IEEE(__,_x, _R9,_w) SCL_210 + ALU_LAST, + /* 143 */ + ALU_MUL_IEEE(__,_y, _R9,_z, ALU_SRC_PS,_x) + ALU_LAST, + /* 144 */ + ALU_MUL(__,_x, ALU_SRC_PV,_y, KC0(2),_x) + ALU_LAST, + /* 145 */ + ALU_ADD(__,_z, KC0(2),_y, ALU_SRC_PV,_x) + ALU_LAST, + /* 146 */ + 
ALU_FLOOR(__,_w, ALU_SRC_PV,_z) + ALU_LAST, + /* 147 */ + ALU_ADD(__,_y, KC0(2) _NEG,_z, ALU_SRC_PV,_w) + ALU_LAST, + /* 148 */ + ALU_MUL(__,_x, KC0(2),_w, ALU_SRC_PV,_y) + ALU_LAST, + /* 149 */ + ALU_MUL(_R9,_z, _R9,_w, ALU_SRC_PV,_x) + ALU_LAST, + }, + { + /* 150 */ + ALU_NOP(__,_x), + ALU_MUL(_R0,_x, KC0(3),_y, _R0,_y) + ALU_LAST, + }, + { + VTX_FETCH(_R0,_m,_m,_z,_m, _R4,_w, (132), FETCH_TYPE(NO_INDEX_OFFSET), MEGA(16), OFFSET(0)), + }, + { + VTX_FETCH(_R16,_x,_y,_m,_m, _R4,_w, (132), FETCH_TYPE(NO_INDEX_OFFSET), MEGA(16), OFFSET(0)), + }, + { + VTX_FETCH(_R8,_x,_y,_z,_m, _R0,_z, (130), FETCH_TYPE(NO_INDEX_OFFSET), MEGA(16), OFFSET(0)), + }, + { + VTX_FETCH(_R1,_x,_y,_z,_m, _R0,_y, (130), FETCH_TYPE(NO_INDEX_OFFSET), MEGA(16), OFFSET(0)), + }, + { + VTX_FETCH(_R1,_x,_y,_z,_m, _R0,_w, (130), FETCH_TYPE(NO_INDEX_OFFSET), MEGA(16), OFFSET(0)), + VTX_FETCH(_R0,_x,_y,_m,_m, _R0,_z, (130), FETCH_TYPE(NO_INDEX_OFFSET), MEGA(16), OFFSET(0)), + }, + { + VTX_FETCH(_R1,_x,_y,_z,_m, _R0,_w, (130), FETCH_TYPE(NO_INDEX_OFFSET), MEGA(16), OFFSET(0)), + VTX_FETCH(_R0,_x,_y,_z,_m, _R0,_y, (130), FETCH_TYPE(NO_INDEX_OFFSET), MEGA(16), OFFSET(0)), + }, + { + VTX_FETCH(_R0,_x,_y,_z,_m, _R0,_z, (130), FETCH_TYPE(NO_INDEX_OFFSET), MEGA(16), OFFSET(0)), + }, + { + VTX_FETCH(_R1,_x,_y,_z,_m, _R0,_w, (130), FETCH_TYPE(NO_INDEX_OFFSET), MEGA(16), OFFSET(0)), + VTX_FETCH(_R0,_x,_y,_z,_m, _R0,_x, (130), FETCH_TYPE(NO_INDEX_OFFSET), MEGA(16), OFFSET(0)), + }, +}; + +static GX2LoopVar VShaderLoopVars[] = +{ + { 0x00000000, 0xFFFFFFFF }, +}; + +GX2VertexShader VShaderHWNoSkinGX2 = { + { + .sq_pgm_resources_vs.num_gprs = 18, + .sq_pgm_resources_vs.stack_size = 3, + .spi_vs_out_config.vs_export_count = 3, + .num_spi_vs_out_id = 1, + { + { .semantic_0 = 0x00, .semantic_1 = 0x01, .semantic_2 = 0x03, .semantic_3 = 0x02 }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { 
.semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + }, + .sq_vtx_semantic_clear = ~0xF, + .num_sq_vtx_semantic = 4, + { + 2, 4, 0, 1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + }, + .vgt_vertex_reuse_block_cntl.vtx_reuse_depth = 0xE, + .vgt_hos_reuse_depth.reuse_depth = 0x10, + }, /* regs */ + .size = sizeof(VShaderHWNoSkinCode), + .program = (u8 *)&VShaderHWNoSkinCode, + .mode = GX2_SHADER_MODE_UNIFORM_BLOCK, + .loopVarCount = countof(VShaderLoopVars), VShaderLoopVars, + .gx2rBuffer.flags = GX2R_RESOURCE_LOCKED_READ_ONLY, + +}; + +__attribute__((aligned(GX2_SHADER_ALIGNMENT))) +static struct +{ + u64 cf[128]; + u64 alu[39]; /* 128 */ + u64 alu1[3]; /* 167 */ + u64 alu2[8]; /* 170 */ + u64 alu3[25]; /* 178 */ + u64 alu4[91]; /* 203 */ + u64 alu5[12]; /* 294 */ + u64 alu6[1]; /* 306 */ + u64 alu7[8]; /* 307 */ + u64 alu8[6]; /* 315 */ + u64 alu9[12]; /* 321 */ + u64 alu10[3]; /* 333 */ + u64 alu11[5]; /* 336 */ + u64 alu12[3]; /* 341 */ + u64 alu13[1]; /* 344 */ + u64 alu14[11]; /* 345 */ + u64 alu15[13]; /* 356 */ + u64 alu16[1]; /* 369 */ + u64 alu17[2]; /* 370 */ + u64 alu18[3]; /* 372 */ + u64 alu19[20]; /* 375 */ + u64 alu20[13]; /* 395 */ + u64 alu21[17]; /* 408 */ + u64 alu22[18]; /* 425 */ + u64 alu23[3]; /* 443 */ + u64 alu24[9]; /* 446 */ + u64 alu25[1]; /* 455 */ + u64 alu26[23]; /* 456 */ + 
u64 alu27[1]; /* 479 */ + u64 alu28[4]; /* 480 */ + u64 alu29[4]; /* 484 */ + u64 alu30[3]; /* 488 */ + u64 alu31[1]; /* 491 */ + u64 alu32[2]; /* 492 */ + u64 alu33[2]; /* 494 */ + u64 alu34[1]; /* 496 */ + u64 alu35[3]; /* 497 */ + u64 alu36[3]; /* 500 */ + u64 alu37[9]; /* 503 */ + u64 alu38[3]; /* 512 */ + u64 alu39[2]; /* 515 */ + u64 alu40[1]; /* 517 */ + u64 alu41[3]; /* 518 */ + u64 alu42[3]; /* 521 */ + u64 alu43[9]; /* 524 */ + u64 alu44[2]; /* 533 */ + u64 alu45[1]; /* 535 */ + u64 alu46[2]; /* 536 */ + u64 alu47[8]; /* 538 */ + u64 alu48[3]; /* 546 */ + u64 alu49[1]; /* 549 */ + u64 alu50[14]; /* 550 */ + u64 alu51[5]; /* 564 */ + u64 alu52[15]; /* 569 */ + u64 alu53[14]; /* 584 */ + u64 alu54[3]; /* 598 */ + u64 alu55[3]; /* 601 */ + u64 alu56[37]; /* 604 */ + u64 alu57[2]; /* 641 */ + u64 alu58[8]; /* 643 */ + u64 alu59[5]; /* 651 */ + u64 tex60[1 * 2]; /* 656 */ + u64 tex61[2 * 2]; /* 658 */ + u64 tex62[1 * 2]; /* 662 */ + u64 tex63[1 * 2]; /* 664 */ + u64 tex64[1 * 2]; /* 666 */ + u64 tex65[1 * 2]; /* 668 */ + u64 tex66[2 * 2]; /* 670 */ + u64 tex67[2 * 2]; /* 674 */ + u64 tex68[1 * 2]; /* 678 */ + u64 tex69[2 * 2]; /* 680 */ +} VShaderHWSkinCode = +{ + { + CALL_FS NO_BARRIER, + ALU(128,39) KCACHE0(CB4, _0_15) KCACHE1(CB3, _0_15), + LOOP_START_DX10(9), + ALU_BREAK(167,3) KCACHE0(CB4, _0_15), + TEX(656,1), + ALU(170,8), + TEX(658,2), + ALU(178,25), + LOOP_END(3), + ALU_PUSH_BEFORE(203,91) KCACHE0(CB1, _0_15) KCACHE1(CB4, _0_15), + JUMP(0,49), + ALU(294,12) KCACHE0(CB1, _16_31) KCACHE1(CB2, _0_15), + ALU_PUSH_BEFORE(306,1) KCACHE0(CB4, _0_15), + JUMP(1, 16), + ALU(307,8) KCACHE0(CB4, _0_15) KCACHE1(CB1, _16_31), + ALU_POP_AFTER(315,6) KCACHE0(CB2, _0_15), + ALU(321,12) KCACHE0(CB2, _0_15), + LOOP_START_DX10(48), + ALU_BREAK(333,3), + TEX(662,1), + ALU_PUSH_BEFORE(336,5), + JUMP(1, 47), + TEX(664,1), + ALU(341,3), + TEX(666,1), + ALU_PUSH_BEFORE(344,1), + JUMP(0,30), + ALU(345,11), + TEX(668,1), + ALU(356,13), + ELSE(1, 32), + ALU_POP_AFTER(369,1), + 
ALU_PUSH_BEFORE(370,2), + JUMP(1, 37), + ALU(372,3), + TEX(670,2), + ALU_POP_AFTER(375,20), + ALU(395,13), + TEX(674,2), + ALU_PUSH_BEFORE(408,17) KCACHE0(CB2, _0_15), + JUMP(1, 46), + ALU_PUSH_BEFORE(425,18), + JUMP(2, 46), + ALU(443,3), + TEX(678,1), + ALU_POP2_AFTER(446,9) KCACHE0(CB2, _0_15), + ALU_POP_AFTER(455,1), + LOOP_END(18), + ALU(456,23) KCACHE0(CB4, _0_15), + ELSE(1, 56), + ALU_PUSH_BEFORE(479,1) KCACHE0(CB4, _0_15), + JUMP(0,53), + ALU(480,4) KCACHE0(CB1, _16_31), + ELSE(1, 55), + ALU_POP_AFTER(484,4), + ALU_POP_AFTER(488,3), + ALU_PUSH_BEFORE(491,1) KCACHE0(CB4, _0_15), + JUMP(1, 109), + ALU_PUSH_BEFORE(492,2) KCACHE0(CB4, _0_15), + JUMP(0,69), + ALU_PUSH_BEFORE(494,2) KCACHE0(CB4, _0_15), + JUMP(0,67), + ALU_PUSH_BEFORE(496,1) KCACHE0(CB4, _0_15), + JUMP(0,65), + ALU(497,3) KCACHE0(CB1, _16_31), + ELSE(1, 67), + ALU_POP_AFTER(500,3), + ELSE(1, 69), + ALU_POP_AFTER(503,9) KCACHE0(CB1, _16_31) KCACHE1(CB4, _0_15), + ELSE(1, 108), + ALU_PUSH_BEFORE(512,3) KCACHE0(CB4, _0_15), + JUMP(0,81), + ALU_PUSH_BEFORE(515,2) KCACHE0(CB4, _0_15), + JUMP(0,79), + ALU_PUSH_BEFORE(517,1) KCACHE0(CB4, _0_15), + JUMP(0,77), + ALU(518,3) KCACHE0(CB1, _16_31), + ELSE(1, 79), + ALU_POP_AFTER(521,3), + ELSE(1, 81), + ALU_POP_AFTER(524,9) KCACHE0(CB1, _16_31) KCACHE1(CB4, _0_15), + ELSE(0,107), + ALU_PUSH_BEFORE(533,2) KCACHE0(CB4, _0_15), + JUMP(0,101), + ALU_PUSH_BEFORE(535,1) KCACHE0(CB4, _0_15), + JUMP(1, 100), + ALU_PUSH_BEFORE(536,2) KCACHE0(CB4, _0_15), + JUMP(0,89), + ALU(538,8) KCACHE0(CB4, _0_15), + ELSE(0,99), + ALU_PUSH_BEFORE(546,3) KCACHE0(CB4, _0_15), + JUMP(0,97), + ALU_PUSH_BEFORE(549,1) KCACHE0(CB4, _0_15), + JUMP(0,95), + ALU(550,14) KCACHE0(CB4, _0_15), + ELSE(1, 97), + ALU_POP_AFTER(564,5), + ELSE(1, 99), + ALU_POP_AFTER(569,15) KCACHE0(CB4, _0_15), + POP(2, 100), + ALU(584,14) KCACHE0(CB1, _0_31), + ELSE(1, 107), + ALU_PUSH_BEFORE(598,3) KCACHE0(CB4, _0_15), + JUMP(5, 109), + ALU(601,3) KCACHE0(CB4, _0_15), + TEX(680,2), + ALU_POP2_AFTER(604,37) 
KCACHE0(CB1, _16_31), + POP(2, 108), + POP(1, 109), + ALU_PUSH_BEFORE(641,2) KCACHE0(CB1, _16_31) KCACHE1(CB4, _0_15), + JUMP(1, 112), + ALU_POP_AFTER(643,8) KCACHE0(CB1, _16_31), + ALU(651,2) KCACHE0(CB1, _16_31), + EXP_DONE(POS0, _R12,_x,_y,_z,_w), + EXP(PARAM0, _R5,_x,_y,_z,_w) NO_BARRIER, + EXP(PARAM1, _R7,_x,_y,_z,_w) NO_BARRIER, + EXP(PARAM2, _R9,_x,_y,_z,_w) NO_BARRIER, + EXP_DONE(PARAM3, _R0,_x,_y,_y,_y) NO_BARRIER + END_OF_PROGRAM + }, + { + /* 0 */ + ALU_CNDE_INT(_R0,_y, KC0(5),_w, ALU_SRC_0,_x, _R2,_x), + ALU_MOV(_R0,_z, ALU_SRC_LITERAL,_x), + ALU_MOV(_R3,_w, ALU_SRC_LITERAL,_y) + ALU_LAST, + ALU_LITERAL2(0x00000003, 0x3F800000), + /* 1 */ + ALU_CNDE_INT(_R7,_y, KC0(5),_w, ALU_SRC_0,_x, _R2,_y), + ALU_MOV(_R4,_z, ALU_SRC_LITERAL,_x), + ALU_CNDE_INT(_R0,_w, KC0(5),_w, ALU_SRC_LITERAL,_y, _R2,_z) + ALU_LAST, + ALU_LITERAL2(0x00000003, 0x3F800000), + /* 2 */ + ALU_MOV(_R2,_w, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x3F800000), + /* 3 */ + ALU_MOV(_R18,_x, _R5,_x) + ALU_LAST, + /* 4 */ + ALU_MOV(_R19,_x, _R5,_y) + ALU_LAST, + /* 5 */ + ALU_MOV(_R20,_x, _R5,_z) + ALU_LAST, + /* 6 */ + ALU_MOV(_R21,_x, _R5,_w) + ALU_LAST, + /* 7 */ + ALU_MOV(_R22,_x, _R6,_x) + ALU_LAST, + /* 8 */ + ALU_MOV(_R23,_x, _R6,_y) + ALU_LAST, + /* 9 */ + ALU_MOV(_R24,_x, _R6,_z) + ALU_LAST, + /* 10 */ + ALU_MOV(_R25,_x, _R6,_w) + ALU_LAST, + /* 11 */ + ALU_MUL(_R8,_x, _R5,_x, KC1(0),_x), + ALU_MUL(_R8,_y, _R5,_x, KC1(0),_y), + ALU_MUL(_R8,_z, _R5,_x, KC1(0),_z), + ALU_MUL(_R8,_w, _R5,_x, KC1(0),_w), + ALU_MOV(_R13,_x, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000001), + /* 12 */ + ALU_MUL(_R12,_x, _R5,_x, KC1(1),_x), + ALU_MUL(_R12,_y, _R5,_x, KC1(1),_y), + ALU_MUL(_R12,_z, _R5,_x, KC1(1),_z), + ALU_MUL(_R12,_w, _R5,_x, KC1(1),_w), + ALU_MOV(_R11,_x, ALU_SRC_0,_x) + ALU_LAST, + /* 13 */ + ALU_MUL(_R10,_x, _R5,_x, KC1(2),_x), + ALU_MUL(_R10,_y, _R5,_x, KC1(2),_y), + ALU_MUL(_R10,_z, _R5,_x, KC1(2),_z), + ALU_MUL(_R10,_w, _R5,_x, KC1(2),_w), + ALU_MOV(_R11,_y, 
ALU_SRC_0,_x) + ALU_LAST, + /* 14 */ + ALU_CNDE_INT(_R14,_x, KC0(6),_x, _R0,_y, _R0 _NEG,_y) VEC_201, + ALU_NOP(__,_y), + ALU_MOV(_R11,_z, ALU_SRC_0,_x), + ALU_CNDE_INT(_R13,_y, KC0(6),_x, _R7,_y, _R7 _NEG,_y) VEC_021 + ALU_LAST, + /* 15 */ + ALU_CNDE_INT(_R13,_z, KC0(6),_x, _R0,_w, _R0 _NEG,_w) + ALU_LAST, + }, + { + /* 16 */ + ALU_ADD_INT(__,_x, KC0(8),_y, ALU_SRC_1_INT,_x) + ALU_LAST, + /* 17 */ + ALU_SETGT_INT(_R0,_y, ALU_SRC_PV,_x, _R13,_x) + ALU_LAST, + /* 18 */ + ALU_PRED_SETNE_INT(__,_x, _R0,_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 19 */ + ALU_MOV(_R0,_x, _R13,_x), + ALU_ADD_INT(_R0,_y, _R0,_z, ALU_SRC_LITERAL,_x), + ALU_ADD_INT(_R0,_z, _R4,_z, ALU_SRC_LITERAL,_y) VEC_120, + ALU_ADD_INT(_R0,_w, _R0,_z, ALU_SRC_1_INT,_x), + ALU_ADD_INT(_R13,_x, _R13,_x, ALU_SRC_1_INT,_x) + ALU_LAST, + ALU_LITERAL2(0x00000002, 0x00000003), + /* 20 */ + ALU_MOVA_INT(__,_x, _R0,_x) + ALU_LAST, + /* 21 */ + ALU_MOV(_R0,_x, _R18 AR,_x) + ALU_LAST, + }, + { + /* 22 */ + ALU_MUL(_R127,_x, _R0,_x, _R5,_w) VEC_201, + ALU_MUL(_R127,_y, _R0,_x, _R5,_y) VEC_201, + ALU_MUL(_R127,_z, _R0,_x, _R5,_z) VEC_201, + ALU_MUL(_R127,_w, _R0,_x, _R5,_x) VEC_201, + ALU_MUL(_R126,_y, _R0,_x, _R6,_x) + ALU_LAST, + /* 23 */ + ALU_MUL(_R126,_x, _R0,_x, _R6,_z), + ALU_MUL(_R125,_y, _R0,_x, _R7,_x), + ALU_MUL(_R126,_z, _R0,_x, _R6,_y) VEC_021, + ALU_MUL(_R126,_w, _R0,_x, _R6,_w), + ALU_MUL(_R125,_x, _R0,_x, _R7,_y) + ALU_LAST, + /* 24 */ + ALU_ADD(_R8,_x, _R8,_x, _R127,_w), + ALU_MUL(_R127,_y, _R0,_x, _R7,_w) VEC_120, + ALU_ADD(_R8,_z, _R8,_z, _R127,_z), + ALU_MUL(_R127,_w, _R0,_x, _R7,_z) VEC_120, + ALU_ADD(_R8,_y, _R8,_y, _R127,_y) + ALU_LAST, + /* 25 */ + ALU_ADD(_R12,_x, _R12,_x, _R126,_y), + ALU_ADD(_R12,_y, _R12,_y, _R126,_z), + ALU_ADD(_R12,_z, _R12,_z, _R126,_x), + ALU_ADD(_R8,_w, _R8,_w, _R127,_x) VEC_021, + ALU_ADD(_R12,_w, _R12,_w, _R126,_w) + ALU_LAST, + /* 26 */ + ALU_ADD(_R10,_x, _R10,_x, _R125,_y), + ALU_ADD(_R10,_y, _R10,_y, _R125,_x), + 
ALU_ADD(_R10,_z, _R10,_z, _R127,_w), + ALU_ADD(_R10,_w, _R10,_w, _R127,_y) VEC_021 + ALU_LAST, + /* 27 */ + ALU_MOV(_R4,_z, _R0,_z) + ALU_LAST, + }, + { + /* 28 */ + ALU_DOT4(__,_x, _R3,_x, _R8,_x), + ALU_DOT4(__,_y, _R3,_y, _R8,_y), + ALU_DOT4(__,_z, _R3,_z, _R8,_z), + ALU_DOT4(__,_w, _R3,_w, _R8,_w), + ALU_MUL(_R127,_w, KC0(3),_x, ALU_SRC_1,_x) + ALU_LAST, + /* 29 */ + ALU_DOT4(__,_x, _R3,_x, _R12,_x), + ALU_DOT4(__,_y, _R3,_y, _R12,_y), + ALU_DOT4(__,_z, _R3,_z, _R12,_z), + ALU_DOT4(__,_w, _R3,_w, _R12,_w), + ALU_MOV(_R0,_x, ALU_SRC_PV,_x) + ALU_LAST, + /* 30 */ + ALU_DOT4(__,_x, _R3,_x, _R10,_x), + ALU_DOT4(__,_y, _R3,_y, _R10,_y), + ALU_DOT4(__,_z, _R3,_z, _R10,_z), + ALU_DOT4(__,_w, _R3,_w, _R10,_w), + ALU_MOV(_R0,_y, ALU_SRC_PV,_x) + ALU_LAST, + /* 31 */ + ALU_MUL(_R127,_x, KC0(3),_y, ALU_SRC_1,_x), + ALU_MUL(_R127,_y, KC0(3),_z, ALU_SRC_1,_x), + ALU_MOV(_R0,_z, ALU_SRC_PV,_x), + ALU_MUL(_R126,_w, KC0(3),_w, ALU_SRC_1,_x), + ALU_MUL(__,_x, _R13,_z, _R8,_z) + ALU_LAST, + /* 32 */ + ALU_DOT4(_R6,_x, _R0,_x, KC0(11),_x), + ALU_DOT4(__,_y, _R0,_y, KC0(11),_y), + ALU_DOT4(__,_z, ALU_SRC_PV,_z, KC0(11),_z), + ALU_DOT4(__,_w, _R3,_w, KC0(11),_w), + ALU_MULADD(_R122,_x, _R13,_y, _R8,_y, ALU_SRC_PS,_x) + ALU_LAST, + /* 33 */ + ALU_DOT4(__,_x, _R0,_x, KC0(12),_x), + ALU_DOT4(_R5,_y, _R0,_y, KC0(12),_y), + ALU_DOT4(__,_z, _R0,_z, KC0(12),_z), + ALU_DOT4(__,_w, _R3,_w, KC0(12),_w), + ALU_MULADD(_R125,_x, _R14,_x, _R8,_x, ALU_SRC_PS,_x) + ALU_LAST, + /* 34 */ + ALU_DOT4(__,_x, _R0,_x, KC0(13),_x), + ALU_DOT4(__,_y, _R0,_y, KC0(13),_y), + ALU_DOT4(_R4,_z, _R0,_z, KC0(13),_z), + ALU_DOT4(__,_w, _R3,_w, KC0(13),_w), + ALU_MOV(_R0,_w, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x3F800000), + /* 35 */ + ALU_DOT4(__,_x, _R6,_x, KC0(10),_x), + ALU_DOT4(_R6,_y, _R5,_y, KC0(10),_y), + ALU_DOT4(__,_z, ALU_SRC_PV,_x, KC0(10),_z), + ALU_DOT4(__,_w, ALU_SRC_PS,_x, KC0(10),_w), + ALU_MUL(__,_x, _R13,_z, _R12,_z) + ALU_LAST, + /* 36 */ + ALU_DOT4(_R126,_x, _R6,_x, KC0(9),_x), + 
ALU_DOT4(__,_y, _R5,_y, KC0(9),_y), + ALU_DOT4(__,_z, _R4,_z, KC0(9),_z), + ALU_DOT4(__,_w, _R0,_w, KC0(9),_w), + ALU_MULADD(_R122,_x, _R13,_y, _R12,_y, ALU_SRC_PS,_x) + ALU_LAST, + /* 37 */ + ALU_MULADD(_R127,_x, _R6,_y, KC0(2),_w, _R126,_w), + ALU_MULADD(_R127,_y, _R6,_y, KC0(2),_z, _R127,_y), + ALU_MULADD(_R127,_z, _R6,_y, KC0(2),_y, _R127,_x) VEC_120, + ALU_MULADD(_R127,_w, _R6,_y, KC0(2),_x, _R127,_w) VEC_021, + ALU_MULADD(_R126,_y, _R14,_x, _R12,_x, ALU_SRC_PS,_x) + ALU_LAST, + /* 38 */ + ALU_DOT4(__,_x, _R6,_x, KC0(8),_x), + ALU_DOT4(__,_y, _R5,_y, KC0(8),_y), + ALU_DOT4(_R126,_z, _R4,_z, KC0(8),_z), + ALU_DOT4(__,_w, _R0,_w, KC0(8),_w), + ALU_MUL(__,_x, _R13,_z, _R10,_z) + ALU_LAST, + /* 39 */ + ALU_MULADD(_R123,_x, _R126,_x, KC0(1),_w, _R127,_x), + ALU_MULADD(_R123,_y, _R126,_x, KC0(1),_z, _R127,_y) VEC_120, + ALU_MULADD(_R123,_z, _R126,_x, KC0(1),_y, _R127,_z), + ALU_MULADD(_R123,_w, _R126,_x, KC0(1),_x, _R127,_w), + ALU_MULADD(_R122,_x, _R13,_y, _R10,_y, ALU_SRC_PS,_x) + ALU_LAST, + /* 40 */ + ALU_MULADD(_R12,_x, _R126,_z, KC0(0),_x, ALU_SRC_PV,_w), + ALU_MULADD(_R12,_y, _R126,_z, KC0(0),_y, ALU_SRC_PV,_z), + ALU_MULADD(_R12,_z, _R126,_z, KC0(0),_z, ALU_SRC_PV,_y), + ALU_MULADD(_R12,_w, _R126,_z, KC0(0),_w, ALU_SRC_PV,_x), + ALU_MULADD(_R126,_z, _R14,_x, _R10,_x, ALU_SRC_PS,_x) + ALU_LAST, + /* 41 */ + ALU_DOT4(_R127,_x, _R125,_x, KC0(11),_x), + ALU_DOT4(__,_y, _R126,_y, KC0(11),_y), + ALU_DOT4(__,_z, ALU_SRC_PS,_x, KC0(11),_z), + ALU_DOT4(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x) + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 42 */ + ALU_DOT4(__,_x, _R125,_x, KC0(12),_x), + ALU_DOT4(_R127,_y, _R126,_y, KC0(12),_y), + ALU_DOT4(__,_z, _R126,_z, KC0(12),_z), + ALU_DOT4(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x) + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 43 */ + ALU_DOT4(__,_x, _R125,_x, KC0(13),_x), + ALU_DOT4(__,_y, _R126,_y, KC0(13),_y), + ALU_DOT4(_R126,_z, _R126,_z, KC0(13),_z), + ALU_DOT4(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x) + ALU_LAST, + 
ALU_LITERAL(0x80000000), + /* 44 */ + ALU_DOT4_IEEE(__,_x, _R127,_x, _R127,_x), + ALU_DOT4_IEEE(__,_y, _R127,_y, _R127,_y), + ALU_DOT4_IEEE(__,_z, ALU_SRC_PV,_x, ALU_SRC_PV,_x), + ALU_DOT4_IEEE(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x) + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 45 */ + ALU_RECIPSQRT_IEEE(__,_x, ALU_SRC_PV,_x) SCL_210 + ALU_LAST, + /* 46 */ + ALU_MUL(_R16,_x, _R127,_x, ALU_SRC_PS,_x), + ALU_MUL(_R16,_y, _R127,_y, ALU_SRC_PS,_x), + ALU_MUL(_R6,_z, _R126,_z, ALU_SRC_PS,_x) + ALU_LAST, + /* 47 */ + ALU_PRED_SETNE_INT(__,_x, KC1(9),_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 48 */ + ALU_MOV(_R13,_x, KC0(4),_x), + ALU_MOV(_R13,_y, KC0(4),_y), + ALU_MOV(_R13,_z, KC0(4),_z), + ALU_MOV(_R13,_w, KC0(4),_w) + ALU_LAST, + /* 49 */ + ALU_MOV(_R8,_x, KC1(1),_x), + ALU_MOV(_R8,_y, KC1(1),_y), + ALU_MOV(_R8,_z, KC1(1),_z), + ALU_MOV(_R0,_w, KC1(1),_w) + ALU_LAST, + /* 50 */ + ALU_MOV(_R7,_x, KC1(2),_x), + ALU_MOV(_R7,_y, KC1(2),_y), + ALU_MOV(_R7,_z, KC1(2),_z), + ALU_MOV(_R0,_w, KC1(2),_w) + ALU_LAST, + }, + { + /* 51 */ + ALU_PRED_SETNE_INT(__,_x, KC0(4),_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 52 */ + ALU_AND_INT(_R0,_x, KC0(8),_w, ALU_SRC_LITERAL,_x), + ALU_AND_INT(__,_y, KC0(8),_w, ALU_SRC_1_INT,_x), + ALU_AND_INT(_R0,_w, KC0(8),_w, ALU_SRC_LITERAL,_y) + ALU_LAST, + ALU_LITERAL2(0x00000002, 0x00000004), + /* 53 */ + ALU_CNDE_INT(_R13,_x, ALU_SRC_PV,_y, KC1(4),_x, _R1,_x), + ALU_CNDE_INT(_R13,_y, ALU_SRC_PV,_y, KC1(4),_y, _R1,_y), + ALU_CNDE_INT(_R13,_z, ALU_SRC_PV,_y, KC1(4),_z, _R1,_z), + ALU_CNDE_INT(_R13,_w, ALU_SRC_PV,_y, KC1(4),_w, _R1,_w) + ALU_LAST, + }, + { + /* 54 */ + ALU_CNDE_INT(_R8,_x, _R0,_x, KC0(1),_x, _R1,_x), + ALU_CNDE_INT(_R8,_y, _R0,_x, KC0(1),_y, _R1,_y), + ALU_CNDE_INT(_R8,_z, _R0,_x, KC0(1),_z, _R1,_z) + ALU_LAST, + /* 55 */ + ALU_CNDE_INT(_R7,_x, _R0,_w, KC0(2),_x, _R1,_x), + ALU_CNDE_INT(_R7,_y, _R0,_w, KC0(2),_y, _R1,_y), + ALU_CNDE_INT(_R7,_z, 
_R0,_w, KC0(2),_z, _R1,_z) + ALU_LAST, + }, + { + /* 56 */ + ALU_MOV(__,_x, ALU_SRC_0,_x), + ALU_MOV(__,_y, KC0(3),_z), + ALU_MOV(__,_z, KC0(3),_y), + ALU_MOV(__,_w, KC0(3),_x), + ALU_MOV(_R10,_x, ALU_SRC_0,_x) + ALU_LAST, + /* 57 */ + ALU_MULADD(_R14,_x, _R13,_x, KC0(0),_x, ALU_SRC_PV,_w), + ALU_MULADD(_R14,_y, _R13,_y, KC0(0),_y, ALU_SRC_PV,_z), + ALU_MULADD(_R14,_z, _R13,_z, KC0(0),_z, ALU_SRC_PV,_y), + ALU_MULADD(_R14,_w, _R13,_w, KC0(0),_w, ALU_SRC_PV,_x), + ALU_MOV(_R10,_y, ALU_SRC_0,_x) + ALU_LAST, + /* 58 */ + ALU_MOV(_R10,_z, ALU_SRC_0,_x), + ALU_MOV(_R4,_w, ALU_SRC_0,_x) + ALU_LAST, + }, + { + /* 59 */ + ALU_SETGT_INT(_R0,_w, ALU_SRC_LITERAL,_x, _R4,_w) + ALU_LAST, + ALU_LITERAL(0x00000004), + /* 60 */ + ALU_PRED_SETNE_INT(__,_x, _R0,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 61 */ + ALU_CNDE_INT(_R0,_x, _R0,_z, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x), + ALU_ADD_INT(__,_y, _R4,_w, ALU_SRC_1_INT,_x) + ALU_LAST, + ALU_LITERAL(0x3F800000), + /* 62 */ + ALU_CNDE_INT(_R4,_w, _R0,_z, ALU_SRC_PV,_y, _R4,_w) + ALU_LAST, + /* 63 */ + ALU_PRED_SETE_INT(__,_x, _R0,_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 64 */ + ALU_SETE_INT(_R0,_z, _R17,_y, ALU_SRC_0,_x), + ALU_ADD_INT(_R0,_w, _R4,_w, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000004), + }, + { + /* 65 */ + ALU_PRED_SETE_INT(__,_x, _R0,_z, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 66 */ + ALU_ADD(_R5,_x, _R6 _NEG,_x, _R15,_x), + ALU_ADD_INT(_R0,_y, _R4,_w, ALU_SRC_LITERAL,_x), + ALU_ADD(_R0,_z, _R5 _NEG,_y, _R15,_y), + ALU_ADD(_R1,_w, _R4 _NEG,_z, _R15,_z), + ALU_MOV(_R0,_x, ALU_SRC_LITERAL,_y) + ALU_LAST, + ALU_LITERAL2(0x0000000C, 0x3F800000), + /* 67 */ + ALU_DOT4(__,_x, ALU_SRC_PV,_x, ALU_SRC_PV,_x), + ALU_DOT4(__,_y, ALU_SRC_PV,_z, ALU_SRC_PV,_z), + ALU_DOT4(__,_z, ALU_SRC_PV,_w, ALU_SRC_PV,_w), + ALU_DOT4(_R0,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x) + ALU_LAST, + ALU_LITERAL(0x80000000), 
+ }, + { + /* 68 */ + ALU_SQRT_IEEE(__,_x, _R0,_w) SCL_210 + ALU_LAST, + /* 69 */ + ALU_MOV(__,_y, ALU_SRC_PS,_x), + ALU_MUL(__,_z, ALU_SRC_PS,_x, ALU_SRC_PS,_x), + ALU_RECIP_IEEE(_R127,_w, ALU_SRC_PS,_x) SCL_210 + ALU_LAST, + /* 70 */ + ALU_DOT4(__,_x, _R0,_x, _R1,_x), + ALU_DOT4(__,_y, ALU_SRC_PV,_y, _R1,_y), + ALU_DOT4(__,_z, ALU_SRC_PV,_z, _R1,_z), + ALU_DOT4(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x), + ALU_MUL_IEEE(_R15,_x, _R5,_x, ALU_SRC_PS,_x) + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 71 */ + ALU_MUL_IEEE(_R15,_y, _R0,_z, _R127,_w), + ALU_MUL_IEEE(_R15,_z, _R1,_w, _R127,_w), + ALU_RECIP_IEEE(_R1,_w, ALU_SRC_PV,_x) CLAMP SCL_210 + ALU_LAST, + }, + { + /* 72 */ + ALU_MOV(_R1,_w, _R2,_w) + ALU_LAST, + }, + { + /* 73 */ + ALU_PRED_SETGE_INT(__,_x, _R17,_y, ALU_SRC_LITERAL,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + ALU_LITERAL(0x00000002), + }, + { + /* 74 */ + ALU_ADD_INT(_R0,_z, _R4,_w, ALU_SRC_LITERAL,_x), + ALU_ADD_INT(_R0,_w, _R4,_w, ALU_SRC_LITERAL,_y) + ALU_LAST, + ALU_LITERAL2(0x00000010, 0x00000008), + }, + { + /* 75 */ + ALU_DOT4_IEEE(__,_x, _R1,_x, _R1,_x), + ALU_DOT4_IEEE(__,_y, _R1,_y, _R1,_y), + ALU_DOT4_IEEE(__,_z, _R1,_z, _R1,_z), + ALU_DOT4_IEEE(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x) + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 76 */ + ALU_RECIPSQRT_IEEE(__,_x, ALU_SRC_PV,_x) SCL_210 + ALU_LAST, + /* 77 */ + ALU_MUL(__,_x, _R1,_x, ALU_SRC_PS,_x), + ALU_MUL(__,_y, _R1,_y, ALU_SRC_PS,_x), + ALU_MUL(__,_z, _R1,_z, ALU_SRC_PS,_x) + ALU_LAST, + /* 78 */ + ALU_DOT4(__,_x, _R15,_x, ALU_SRC_PV,_x), + ALU_DOT4(__,_y, _R15,_y, ALU_SRC_PV,_y), + ALU_DOT4(__,_z, _R15,_z, ALU_SRC_PV,_z), + ALU_DOT4(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x) + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 79 */ + ALU_SETGE_DX10(_R127,_x, ALU_SRC_PV,_x, _R0,_x), + ALU_LOG_CLAMPED(__,_x, ALU_SRC_PV,_x) SCL_210 + ALU_LAST, + /* 80 */ + ALU_MUL(__,_z, _R0,_y, ALU_SRC_PS,_x) + ALU_LAST, + /* 81 */ + ALU_EXP_IEEE(__,_x, ALU_SRC_PV,_z) SCL_210 + ALU_LAST, + /* 82 */ + 
ALU_MUL(__,_x, _R1,_w, ALU_SRC_PS,_x) + ALU_LAST, + /* 83 */ + ALU_CNDE_INT(_R1,_w, _R127,_x, ALU_SRC_0,_x, ALU_SRC_PV,_x) + ALU_LAST, + }, + { + /* 84 */ + ALU_SETE_INT(_R5,_x, _R17,_x, ALU_SRC_LITERAL,_x), + ALU_ADD_INT(_R0,_y, _R4,_w, ALU_SRC_LITERAL,_y), + ALU_ADD_INT(_R0,_w, _R4,_w, ALU_SRC_LITERAL,_z), + ALU_MUL(__,_x, _R6,_z, _R15,_z) + ALU_LAST, + ALU_LITERAL3(0x00000002, 0x00000014, 0x00000018), + /* 85 */ + ALU_DOT4(__,_x, _R16,_x, _R15,_x), + ALU_DOT4(__,_y, _R16,_y, _R15,_y), + ALU_DOT4(__,_z, ALU_SRC_PS,_x, ALU_SRC_1,_x), + ALU_DOT4(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x) + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 86 */ + ALU_MAX(_R5,_z, ALU_SRC_PV,_x, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x33D6BF95), + }, + { + /* 87 */ + ALU_MUL(_R127,_x, _R8,_x, _R1,_x), + ALU_MUL(_R127,_z, _R8,_z, _R1,_z), + ALU_MUL(_R127,_w, _R8,_y, _R1,_y), + ALU_LOG_CLAMPED(__,_x, _R5,_z) SCL_210 + ALU_LAST, + /* 88 */ + ALU_MUL(_R126,_x, _R13,_x, _R0,_x), + ALU_MUL(_R127,_y, _R13,_y, _R0,_y), + ALU_MUL(_R126,_z, _R13,_z, _R0,_z), + ALU_MUL(__,_w, KC0(2),_w, ALU_SRC_PS,_x) + ALU_LAST, + /* 89 */ + ALU_EXP_IEEE(__,_x, ALU_SRC_PV,_w) SCL_210 + ALU_LAST, + /* 90 */ + ALU_CNDE_INT(_R123,_y, _R5,_x, _R5,_z, ALU_SRC_PS,_x) + ALU_LAST, + /* 91 */ + ALU_MULADD(_R123,_x, ALU_SRC_PV,_y, _R127,_w, _R127,_y), + ALU_MULADD(_R123,_y, ALU_SRC_PV,_y, _R127,_x, _R126,_x), + ALU_MULADD(_R123,_w, ALU_SRC_PV,_y, _R127,_z, _R126,_z) + ALU_LAST, + /* 92 */ + ALU_MULADD(_R14,_x, _R1,_w, ALU_SRC_PV,_y, _R14,_x), + ALU_MULADD(_R14,_y, _R1,_w, ALU_SRC_PV,_x, _R14,_y), + ALU_MULADD(_R14,_z, _R1,_w, ALU_SRC_PV,_w, _R14,_z) + ALU_LAST, + /* 93 */ + ALU_PRED_SETNE_INT(__,_x, ALU_SRC_0,_x, _R17,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 94 */ + ALU_ADD(_R127,_x, _R15,_x, ALU_SRC_0,_x), + ALU_ADD(_R127,_y, _R15,_y, ALU_SRC_0,_x), + ALU_ADD(_R127,_z, _R15,_z, ALU_SRC_1,_x) + ALU_LAST, + /* 95 */ + ALU_DOT4_IEEE(__,_x, ALU_SRC_PV,_x, ALU_SRC_PV,_x), + ALU_DOT4_IEEE(__,_y, 
ALU_SRC_PV,_y, ALU_SRC_PV,_y), + ALU_DOT4_IEEE(__,_z, ALU_SRC_PV,_z, ALU_SRC_PV,_z), + ALU_DOT4_IEEE(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x) + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 96 */ + ALU_RECIPSQRT_IEEE(__,_x, ALU_SRC_PV,_x) SCL_210 + ALU_LAST, + /* 97 */ + ALU_MUL(__,_x, _R127,_x, ALU_SRC_PS,_x), + ALU_MUL(__,_y, _R127,_y, ALU_SRC_PS,_x), + ALU_MUL(__,_z, _R127,_z, ALU_SRC_PS,_x) + ALU_LAST, + /* 98 */ + ALU_DOT4(__,_x, _R16,_x, ALU_SRC_PV,_x), + ALU_DOT4(__,_y, _R16,_y, ALU_SRC_PV,_y), + ALU_DOT4(__,_z, _R6,_z, ALU_SRC_PV,_z), + ALU_DOT4(_R0,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x) + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 99 */ + ALU_PRED_SETGT(__,_x, _R0,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 100 */ + ALU_ADD_INT(_R0,_z, _R4,_w, ALU_SRC_LITERAL,_x), + ALU_LOG_CLAMPED(_R1,_z, _R0,_w) SCL_210 + ALU_LAST, + ALU_LITERAL(0x0000001C), + }, + { + /* 101 */ + ALU_MUL(_R127,_x, _R7,_x, _R0,_x), + ALU_MUL(__,_y, KC0(2),_w, _R1,_z), + ALU_MUL(_R127,_z, _R7,_y, _R0,_y), + ALU_MUL(_R127,_w, _R7,_z, _R0,_z) VEC_021 + ALU_LAST, + /* 102 */ + ALU_EXP_IEEE(__,_x, ALU_SRC_PV,_y) SCL_210 + ALU_LAST, + /* 103 */ + ALU_MUL(__,_w, _R1,_w, ALU_SRC_PS,_x) + ALU_LAST, + /* 104 */ + ALU_MULADD(_R10,_x, ALU_SRC_PV,_w, _R127,_x, _R10,_x), + ALU_MULADD(_R10,_y, ALU_SRC_PV,_w, _R127,_z, _R10,_y), + ALU_MULADD(_R10,_z, ALU_SRC_PV,_w, _R127,_w, _R10,_z) + ALU_LAST, + }, + { + /* 105 */ + ALU_ADD_INT(_R4,_w, _R4,_w, ALU_SRC_1_INT,_x) + ALU_LAST, + }, + { + /* 106 */ + ALU_MOV(_R127,_x, _R14,_z) CLAMP, + ALU_MOV(_R127,_y, _R14,_y) CLAMP, + ALU_MOV(_R127,_z, _R14,_x) CLAMP, + ALU_MOV(_R0,_w, ALU_SRC_0,_x), + ALU_MOV(__,_x, _R14,_w) CLAMP + ALU_LAST, + /* 107 */ + ALU_MOV(_R126,_x, _R14,_z) CLAMP, + ALU_MOV(_R126,_y, _R14,_y) CLAMP, + ALU_MOV(_R126,_z, _R14,_x) CLAMP, + ALU_ADD(_R127,_w, ALU_SRC_PV,_w, ALU_SRC_PS,_x) CLAMP, + ALU_MOV(_R126,_w, _R14,_w) CLAMP + ALU_LAST, + /* 108 */ + ALU_MOV(_R125,_x, _R10,_z) CLAMP, + ALU_MOV(_R127,_y, 
_R10,_y) CLAMP, + ALU_MOV(_R127,_z, _R10,_x) CLAMP, + ALU_ADD(__,_w, _R10,_x, _R127,_z) CLAMP, + ALU_ADD(__,_x, _R10,_y, _R127,_y) CLAMP + ALU_LAST, + /* 109 */ + ALU_ADD(__,_x, _R10,_z, _R127,_x) CLAMP, + ALU_CNDE_INT(_R5,_y, KC0(4),_x, ALU_SRC_PS,_x, _R126,_y), + ALU_CNDE_INT(_R5,_x, KC0(4),_x, ALU_SRC_PV,_w, _R126,_z) VEC_021 + ALU_LAST, + /* 110 */ + ALU_CNDE_INT(_R7,_x, KC0(4),_x, ALU_SRC_0,_x, _R127,_z), + ALU_CNDE_INT(_R7,_y, KC0(4),_x, ALU_SRC_0,_x, _R127,_y), + ALU_CNDE_INT(_R5,_z, KC0(4),_x, ALU_SRC_PV,_x, _R126,_x) VEC_021, + ALU_CNDE_INT(_R5,_w, KC0(4),_x, _R127,_w, _R126,_w), + ALU_CNDE_INT(_R7,_z, KC0(4),_x, ALU_SRC_0,_x, _R125,_x) VEC_021 + ALU_LAST, + }, + { + /* 111 */ + ALU_PRED_SETE_INT(__,_x, KC0(4),_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 112 */ + ALU_MOV(_R5,_x, KC0(4),_x), + ALU_MOV(_R5,_y, KC0(4),_y), + ALU_MOV(_R5,_z, KC0(4),_z), + ALU_MOV(_R5,_w, KC0(4),_w) + ALU_LAST, + }, + { + /* 113 */ + ALU_MOV(_R5,_x, _R1,_x), + ALU_MOV(_R5,_y, _R1,_y), + ALU_MOV(_R5,_z, _R1,_z), + ALU_MOV(_R5,_w, _R1,_w) + ALU_LAST, + }, + { + /* 114 */ + ALU_MOV(_R7,_x, _R11,_x), + ALU_MOV(_R7,_y, _R11,_y), + ALU_MOV(_R7,_z, _R11,_z) + ALU_LAST, + }, + { + /* 115 */ + ALU_PRED_SETNE_INT(__,_x, KC0(5),_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 116 */ + ALU_SETNE_INT(_R0,_w, KC0(7),_y, ALU_SRC_0,_x) + ALU_LAST, + /* 117 */ + ALU_PRED_SETE_INT(__,_x, _R0,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 118 */ + ALU_NOT_INT(_R0,_z, KC0(4),_y) + ALU_LAST, + /* 119 */ + ALU_PRED_SETNE_INT(__,_x, _R0,_z, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 120 */ + ALU_PRED_SETNE_INT(__,_x, KC0(6),_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 121 */ + ALU_MUL(_R9,_x, _R4,_x, KC0(1),_x), + ALU_MUL(_R9,_y, _R4,_y, KC0(1),_y), + ALU_MOV(_R9,_z, ALU_SRC_0,_x) + ALU_LAST, + }, + { + /* 122 
*/ + ALU_MOV(_R9,_x, _R11,_x), + ALU_MOV(_R9,_y, _R11,_y), + ALU_MOV(_R9,_z, _R11,_z) + ALU_LAST, + }, + { + /* 123 */ + ALU_MULADD(_R127,_x, _R4,_x, KC0(1),_x, KC0(1),_z), + ALU_MOV(_R127,_y, ALU_SRC_0,_x), + ALU_MOV(_R127,_z, ALU_SRC_0,_x), + ALU_MULADD(_R127,_w, _R4,_y, KC0(1),_y, KC0(1),_w) + ALU_LAST, + /* 124 */ + ALU_MOV(__,_x, KC0(1),_w), + ALU_MOV(__,_y, KC0(1),_z) + ALU_LAST, + /* 125 */ + ALU_CNDE_INT(_R9,_x, KC1(6),_y, ALU_SRC_PV,_y, _R127,_x), + ALU_CNDE_INT(_R9,_y, KC1(6),_y, ALU_SRC_PV,_x, _R127,_w), + ALU_CNDE_INT(_R9,_z, KC1(6),_y, _R127,_y, _R127,_z) + ALU_LAST, + }, + { + /* 126 */ + ALU_SETNE_INT(_R0,_y, KC0(7),_y, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000003), + /* 127 */ + ALU_PRED_SETE_INT(__,_x, _R0,_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 128 */ + ALU_NOT_INT(_R0,_x, KC0(4),_y) + ALU_LAST, + /* 129 */ + ALU_PRED_SETNE_INT(__,_x, _R0,_x, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 130 */ + ALU_PRED_SETNE_INT(__,_x, KC0(6),_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 131 */ + ALU_MUL(_R9,_x, _R4,_x, KC0(1),_x), + ALU_MUL(_R9,_y, _R4,_y, KC0(1),_y), + ALU_MOV(_R9,_z, ALU_SRC_0,_x) + ALU_LAST, + }, + { + /* 132 */ + ALU_MOV(_R9,_x, _R11,_x), + ALU_MOV(_R9,_y, _R11,_y), + ALU_MOV(_R9,_z, _R11,_z) + ALU_LAST, + }, + { + /* 133 */ + ALU_MULADD(_R127,_x, _R4,_x, KC0(1),_x, KC0(1),_z), + ALU_MOV(_R127,_y, ALU_SRC_0,_x), + ALU_MOV(_R127,_z, ALU_SRC_0,_x), + ALU_MULADD(_R127,_w, _R4,_y, KC0(1),_y, KC0(1),_w) + ALU_LAST, + /* 134 */ + ALU_MOV(__,_x, KC0(1),_w), + ALU_MOV(__,_y, KC0(1),_z) + ALU_LAST, + /* 135 */ + ALU_CNDE_INT(_R9,_x, KC1(6),_y, ALU_SRC_PV,_y, _R127,_x), + ALU_CNDE_INT(_R9,_y, KC1(6),_y, ALU_SRC_PV,_x, _R127,_w), + ALU_CNDE_INT(_R9,_z, KC1(6),_y, _R127,_y, _R127,_z) + ALU_LAST, + }, + { + /* 136 */ + ALU_SETNE_INT(_R0,_w, KC0(7),_y, ALU_SRC_1_INT,_x) + ALU_LAST, + /* 137 */ + ALU_PRED_SETE_INT(__,_x, 
_R0,_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 138 */ + ALU_PRED_SETNE_INT(__,_x, KC0(7),_z, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 139 */ + ALU_SETNE_INT(_R0,_z, KC0(7),_z, ALU_SRC_1_INT,_x) + ALU_LAST, + /* 140 */ + ALU_PRED_SETE_INT(__,_x, _R0,_z, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 141 */ + ALU_CNDE_INT(_R3,_x, KC0(6),_y, ALU_SRC_0,_x, _R4,_x), + ALU_CNDE_INT(_R3,_y, KC0(6),_y, ALU_SRC_0,_x, _R4,_y), + ALU_MOV(_R0,_z, ALU_SRC_0,_x), + ALU_MOV(_R0,_w, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x3F800000), + /* 142 */ + ALU_CNDE_INT(_R3,_z, KC0(6),_y, ALU_SRC_0,_x, ALU_SRC_PV,_z), + ALU_CNDE_INT(_R3,_w, KC0(6),_y, ALU_SRC_LITERAL,_x, ALU_SRC_PV,_w) + ALU_LAST, + ALU_LITERAL(0x3F800000), + }, + { + /* 143 */ + ALU_SETNE_INT(_R0,_y, KC0(7),_z, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000002), + /* 144 */ + ALU_PRED_SETE_INT(__,_x, _R0,_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 145 */ + ALU_PRED_SETNE_INT(__,_x, KC0(5),_w, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 146 */ + ALU_CNDE_INT(_R127,_x, KC0(6),_x, _R2,_x, _R2 _NEG,_x), + ALU_CNDE_INT(_R127,_y, KC0(6),_x, _R2,_y, _R2 _NEG,_y), + ALU_CNDE_INT(_R127,_z, KC0(6),_x, _R2,_z, _R2 _NEG,_z) + ALU_LAST, + /* 147 */ + ALU_DOT4_IEEE(__,_x, ALU_SRC_PV,_x, ALU_SRC_PV,_x), + ALU_DOT4_IEEE(__,_y, ALU_SRC_PV,_y, ALU_SRC_PV,_y), + ALU_DOT4_IEEE(__,_z, ALU_SRC_PV,_z, ALU_SRC_PV,_z), + ALU_DOT4_IEEE(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x) + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 148 */ + ALU_RECIPSQRT_IEEE(__,_x, ALU_SRC_PV,_x) SCL_210 + ALU_LAST, + /* 149 */ + ALU_MUL(_R3,_x, _R127,_x, ALU_SRC_PS,_x), + ALU_MUL(_R3,_y, _R127,_y, ALU_SRC_PS,_x), + ALU_MUL(_R3,_z, _R127,_z, ALU_SRC_PS,_x), + ALU_MOV(_R3,_w, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x3F800000), + }, + { + /* 150 */ + ALU_MOV(_R3,_x, 
ALU_SRC_0,_x), + ALU_MOV(_R3,_y, ALU_SRC_0,_x), + ALU_MOV(_R3,_z, ALU_SRC_LITERAL,_x), + ALU_MOV(_R3,_w, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x3F800000), + }, + { + /* 151 */ + ALU_MOV(__,_x, ALU_SRC_LITERAL,_x), + ALU_CNDE_INT(_R123,_y, KC0(6),_x, _R2,_z, _R2 _NEG,_z), + ALU_CNDE_INT(_R123,_z, KC0(6),_x, _R2,_y, _R2 _NEG,_y), + ALU_CNDE_INT(_R123,_w, KC0(6),_x, _R2,_x, _R2 _NEG,_x), + ALU_SETNE_INT(_R127,_x, KC0(7),_z, ALU_SRC_LITERAL,_y) + ALU_LAST, + ALU_LITERAL2(0x3F800000, 0x00000003), + /* 152 */ + ALU_CNDE_INT(_R123,_x, KC0(5),_w, ALU_SRC_LITERAL,_x, ALU_SRC_PV,_x), + ALU_CNDE_INT(_R123,_y, KC0(5),_w, ALU_SRC_LITERAL,_x, ALU_SRC_PV,_y), + ALU_CNDE_INT(_R123,_z, KC0(5),_w, ALU_SRC_0,_x, ALU_SRC_PV,_z), + ALU_CNDE_INT(_R123,_w, KC0(5),_w, ALU_SRC_0,_x, ALU_SRC_PV,_w) + ALU_LAST, + ALU_LITERAL(0x3F800000), + /* 153 */ + ALU_CNDE_INT(_R3,_x, _R127,_x, ALU_SRC_PV,_w, ALU_SRC_0,_x), + ALU_CNDE_INT(_R3,_y, _R127,_x, ALU_SRC_PV,_z, ALU_SRC_0,_x), + ALU_CNDE_INT(_R3,_z, _R127,_x, ALU_SRC_PV,_y, ALU_SRC_0,_x), + ALU_CNDE_INT(_R3,_w, _R127,_x, ALU_SRC_PV,_x, ALU_SRC_0,_x) + ALU_LAST, + }, + { + /* 154 */ + ALU_DOT4(_R127,_x, _R3,_x, KC0(14),_x), + ALU_DOT4(__,_y, _R3,_y, KC0(14),_y), + ALU_DOT4(__,_z, _R3,_z, KC0(14),_z), + ALU_DOT4(__,_w, _R3,_w, KC0(14),_w) + ALU_LAST, + /* 155 */ + ALU_DOT4(__,_x, _R3,_x, KC0(15),_x), + ALU_DOT4(__,_y, _R3,_y, KC0(15),_y), + ALU_DOT4(__,_z, _R3,_z, KC0(15),_z), + ALU_DOT4(_R127,_w, _R3,_w, KC0(15),_w) + ALU_LAST, + /* 156 */ + ALU_DOT4(__,_x, _R3,_x, KC0(16),_x), + ALU_DOT4(__,_y, _R3,_y, KC0(16),_y), + ALU_DOT4(_R9,_z, _R3,_z, KC0(16),_z), + ALU_DOT4(__,_w, _R3,_w, KC0(16),_w) + ALU_LAST, + /* 157 */ + ALU_MUL(_R9,_x, KC0(17),_x, _R127,_x), + ALU_MUL(_R9,_y, KC0(17),_y, _R127,_w) + ALU_LAST, + }, + { + /* 158 */ + ALU_SETNE_INT(_R0,_y, KC0(7),_y, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000002), + /* 159 */ + ALU_PRED_SETE_INT(__,_x, _R0,_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, 
+ { + /* 160 */ + ALU_ADD_INT(_R0,_x, KC0(8),_x, ALU_SRC_LITERAL,_x), + ALU_ADD_INT(_R0,_w, KC0(7),_w, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x00000004), + }, + { + /* 161 */ + ALU_DOT4_IEEE(__,_x, _R1,_x, _R1,_x), + ALU_DOT4_IEEE(__,_y, _R1,_y, _R1,_y), + ALU_DOT4_IEEE(__,_z, _R1,_z, _R1,_z), + ALU_DOT4_IEEE(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x), + ALU_MUL_IEEE(__,_x, _R0,_z, _R0,_z) + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 162 */ + ALU_DOT4_IEEE(__,_x, _R0,_x, _R0,_x), + ALU_DOT4_IEEE(__,_y, _R0,_y, _R0,_y), + ALU_DOT4_IEEE(__,_z, ALU_SRC_PS,_x, ALU_SRC_1,_x), + ALU_DOT4_IEEE(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x), + ALU_RECIPSQRT_IEEE(__,_x, ALU_SRC_PV,_x) SCL_210 + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 163 */ + ALU_MUL(_R127,_x, _R1,_x, ALU_SRC_PS,_x), + ALU_MUL(_R127,_y, _R1,_y, ALU_SRC_PS,_x), + ALU_MUL(__,_z, _R1,_z, ALU_SRC_PS,_x), + ALU_RECIPSQRT_IEEE(__,_x, ALU_SRC_PV,_x) SCL_210 + ALU_LAST, + /* 164 */ + ALU_MUL(_R126,_x, _R0,_x, ALU_SRC_PS,_x), + ALU_MUL(_R126,_y, _R0,_y, ALU_SRC_PS,_x), + ALU_MUL(__,_z, _R0,_z, ALU_SRC_PS,_x), + ALU_MUL(__,_x, _R6,_z, ALU_SRC_PV,_z) + ALU_LAST, + /* 165 */ + ALU_DOT4(__,_x, _R16,_x, _R127,_x), + ALU_DOT4(__,_y, _R16,_y, _R127,_y), + ALU_DOT4(__,_z, ALU_SRC_PS,_x, ALU_SRC_1,_x), + ALU_DOT4(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x), + ALU_MUL(__,_x, _R6,_z, ALU_SRC_PV,_z) + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 166 */ + ALU_DOT4(__,_x, _R16,_x, _R126,_x), + ALU_DOT4(__,_y, _R16,_y, _R126,_y), + ALU_DOT4(__,_z, ALU_SRC_PS,_x, ALU_SRC_1,_x), + ALU_DOT4(__,_w, ALU_SRC_LITERAL,_x, ALU_SRC_0,_x), + ALU_ADD_D2(__,_x, ALU_SRC_PV,_x, ALU_SRC_1,_x) + ALU_LAST, + ALU_LITERAL(0x80000000), + /* 167 */ + ALU_MUL(_R9,_x, ALU_SRC_PS,_x, KC0(1),_x), + ALU_ADD_D2(__,_w, ALU_SRC_PV,_x, ALU_SRC_1,_x) + ALU_LAST, + /* 168 */ + ALU_MUL(_R9,_y, ALU_SRC_PV,_w, KC0(1),_y), + ALU_MOV(_R9,_z, ALU_SRC_LITERAL,_x) + ALU_LAST, + ALU_LITERAL(0x3F800000), + }, + { + /* 169 */ + ALU_ADD(_R0,_x, _R6,_y, KC0(3),_x) + ALU_LAST, + /* 
170 */ + ALU_PRED_SETNE_INT(__,_x, KC1(10),_y, ALU_SRC_0,_x) UPDATE_EXEC_MASK(DEACTIVATE) UPDATE_PRED + ALU_LAST, + }, + { + /* 171 */ + ALU_RECIP_IEEE(__,_x, _R12,_w) SCL_210 + ALU_LAST, + /* 172 */ + ALU_MUL_IEEE(__,_y, _R12,_z, ALU_SRC_PS,_x) + ALU_LAST, + /* 173 */ + ALU_MUL(__,_x, ALU_SRC_PV,_y, KC0(2),_x) + ALU_LAST, + /* 174 */ + ALU_ADD(__,_z, KC0(2),_y, ALU_SRC_PV,_x) + ALU_LAST, + /* 175 */ + ALU_FLOOR(__,_w, ALU_SRC_PV,_z) + ALU_LAST, + /* 176 */ + ALU_ADD(__,_y, KC0(2) _NEG,_z, ALU_SRC_PV,_w) + ALU_LAST, + /* 177 */ + ALU_MUL(__,_x, KC0(2),_w, ALU_SRC_PV,_y) + ALU_LAST, + /* 178 */ + ALU_MUL(_R12,_z, _R12,_w, ALU_SRC_PV,_x) + ALU_LAST, + }, + { + /* 179 */ + ALU_NOP(__,_x), + ALU_MUL(_R0,_x, KC0(3),_y, _R0,_x) + ALU_LAST, + }, + { + VTX_FETCH(_R5,_x,_y,_z,_w, _R0,_z, (131), FETCH_TYPE(NO_INDEX_OFFSET), MEGA(16), OFFSET(0)), + }, + { + VTX_FETCH(_R6,_x,_y,_z,_w, _R0,_w, (131), FETCH_TYPE(NO_INDEX_OFFSET), MEGA(16), OFFSET(0)), + VTX_FETCH(_R7,_x,_y,_z,_w, _R0,_y, (131), FETCH_TYPE(NO_INDEX_OFFSET), MEGA(16), OFFSET(0)), + }, + { + VTX_FETCH(_R0,_m,_m,_z,_m, _R4,_w, (132), FETCH_TYPE(NO_INDEX_OFFSET), MEGA(16), OFFSET(0)), + }, + { + VTX_FETCH(_R17,_x,_y,_m,_m, _R4,_w, (132), FETCH_TYPE(NO_INDEX_OFFSET), MEGA(16), OFFSET(0)), + }, + { + VTX_FETCH(_R15,_x,_y,_z,_m, _R0,_w, (130), FETCH_TYPE(NO_INDEX_OFFSET), MEGA(16), OFFSET(0)), + }, + { + VTX_FETCH(_R1,_x,_y,_z,_m, _R0,_y, (130), FETCH_TYPE(NO_INDEX_OFFSET), MEGA(16), OFFSET(0)), + }, + { + VTX_FETCH(_R1,_x,_y,_z,_m, _R0,_w, (130), FETCH_TYPE(NO_INDEX_OFFSET), MEGA(16), OFFSET(0)), + VTX_FETCH(_R0,_x,_y,_m,_m, _R0,_z, (130), FETCH_TYPE(NO_INDEX_OFFSET), MEGA(16), OFFSET(0)), + }, + { + VTX_FETCH(_R1,_x,_y,_z,_m, _R0,_w, (130), FETCH_TYPE(NO_INDEX_OFFSET), MEGA(16), OFFSET(0)), + VTX_FETCH(_R0,_x,_y,_z,_m, _R0,_y, (130), FETCH_TYPE(NO_INDEX_OFFSET), MEGA(16), OFFSET(0)), + }, + { + VTX_FETCH(_R0,_x,_y,_z,_m, _R0,_z, (130), FETCH_TYPE(NO_INDEX_OFFSET), MEGA(16), OFFSET(0)), + }, + { + 
VTX_FETCH(_R1,_x,_y,_z,_m, _R0,_w, (130), FETCH_TYPE(NO_INDEX_OFFSET), MEGA(16), OFFSET(0)), + VTX_FETCH(_R0,_x,_y,_z,_m, _R0,_x, (130), FETCH_TYPE(NO_INDEX_OFFSET), MEGA(16), OFFSET(0)), + }, +}; + +GX2VertexShader VShaderHWSkinGX2 = { + { + .sq_pgm_resources_vs.num_gprs = 26, + .sq_pgm_resources_vs.stack_size = 3, + .spi_vs_out_config.vs_export_count = 3, + .num_spi_vs_out_id = 1, + { + { .semantic_0 = 0x00, .semantic_1 = 0x01, .semantic_2 = 0x03, .semantic_3 = 0x02 }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + }, + .sq_vtx_semantic_clear = ~0x3F, + .num_sq_vtx_semantic = 6, + { + 2, 4, 0, 1, 5, 6, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + }, + .vgt_vertex_reuse_block_cntl.vtx_reuse_depth = 0xE, + .vgt_hos_reuse_depth.reuse_depth = 0x10, + }, /* regs */ + .size = sizeof(VShaderHWSkinCode), + .program = (u8 *)&VShaderHWSkinCode, + .mode = GX2_SHADER_MODE_UNIFORM_BLOCK, + .loopVarCount = countof(VShaderLoopVars), VShaderLoopVars, + .gx2rBuffer.flags = GX2R_RESOURCE_LOCKED_READ_ONLY, + +}; diff --git a/GPU/GX2/GX2Shaders.h b/GPU/GX2/GX2Shaders.h new file mode 100644 index 000000000000..4b0c43032942 --- /dev/null +++ 
b/GPU/GX2/GX2Shaders.h @@ -0,0 +1,26 @@ +#pragma once + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern GX2VertexShader defVShaderGX2; +extern GX2PixelShader defPShaderGX2; + +extern GX2VertexShader stencilUploadVSshaderGX2; +extern GX2PixelShader stencilUploadPSshaderGX2; + +extern GX2VertexShader STVshaderGX2; +extern GX2PixelShader STPshaderGX2; + +extern GX2PixelShader PShaderAllGX2; + +extern GX2VertexShader VShaderSWGX2; +extern GX2VertexShader VShaderHWNoSkinGX2; +extern GX2VertexShader VShaderHWSkinGX2; + +#ifdef __cplusplus +} +#endif diff --git a/GPU/GX2/GX2Util.cpp b/GPU/GX2/GX2Util.cpp new file mode 100644 index 000000000000..35139a429222 --- /dev/null +++ b/GPU/GX2/GX2Util.cpp @@ -0,0 +1,42 @@ +#include "ppsspp_config.h" + +#include +#include +#include +#include + +#include "base/stringutil.h" + +#include "GX2Util.h" + +GX2DepthStencilControlReg StockGX2::depthStencilDisabled; +GX2DepthStencilControlReg StockGX2::depthDisabledStencilWrite; +GX2TargetChannelMaskReg StockGX2::TargetChannelMasks[16]; +GX2StencilMaskReg StockGX2::stencilMask; +GX2ColorControlReg StockGX2::blendDisabledColorWrite; +GX2ColorControlReg StockGX2::blendColorDisabled; +GX2Sampler StockGX2::samplerPoint2DWrap; +GX2Sampler StockGX2::samplerLinear2DWrap; +GX2Sampler StockGX2::samplerPoint2DClamp; +GX2Sampler StockGX2::samplerLinear2DClamp; + +void StockGX2::Init() { + GX2InitColorControlReg(&blendDisabledColorWrite, GX2_LOGIC_OP_COPY, 0x00, GX2_DISABLE, GX2_ENABLE); + GX2InitColorControlReg(&blendColorDisabled, GX2_LOGIC_OP_COPY, 0x00, GX2_DISABLE, GX2_DISABLE); + for(int i = 0; i < countof(TargetChannelMasks); i++) + GX2InitTargetChannelMasksReg(TargetChannelMasks + i, (GX2ChannelMask)i, (GX2ChannelMask)0, (GX2ChannelMask)0, (GX2ChannelMask)0, (GX2ChannelMask)0, (GX2ChannelMask)0, (GX2ChannelMask)0, (GX2ChannelMask)0); + + GX2InitDepthStencilControlReg(&depthStencilDisabled, GX2_DISABLE, GX2_DISABLE, GX2_COMPARE_FUNC_NEVER, GX2_DISABLE, GX2_DISABLE, 
GX2_COMPARE_FUNC_NEVER, GX2_STENCIL_FUNCTION_KEEP, GX2_STENCIL_FUNCTION_KEEP, GX2_STENCIL_FUNCTION_KEEP, GX2_COMPARE_FUNC_NEVER, GX2_STENCIL_FUNCTION_KEEP, GX2_STENCIL_FUNCTION_KEEP, GX2_STENCIL_FUNCTION_KEEP); + GX2InitDepthStencilControlReg(&depthDisabledStencilWrite, GX2_DISABLE, GX2_DISABLE, GX2_COMPARE_FUNC_ALWAYS, GX2_ENABLE, GX2_ENABLE, GX2_COMPARE_FUNC_ALWAYS, GX2_STENCIL_FUNCTION_REPLACE, GX2_STENCIL_FUNCTION_REPLACE, GX2_STENCIL_FUNCTION_REPLACE, GX2_COMPARE_FUNC_ALWAYS, GX2_STENCIL_FUNCTION_REPLACE, GX2_STENCIL_FUNCTION_REPLACE, GX2_STENCIL_FUNCTION_REPLACE); + GX2InitStencilMaskReg(&stencilMask, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + + GX2InitSampler(&samplerPoint2DWrap, GX2_TEX_CLAMP_MODE_WRAP, GX2_TEX_XY_FILTER_MODE_POINT); + GX2InitSampler(&samplerPoint2DClamp, GX2_TEX_CLAMP_MODE_CLAMP, GX2_TEX_XY_FILTER_MODE_POINT); + GX2InitSampler(&samplerLinear2DWrap, GX2_TEX_CLAMP_MODE_WRAP, GX2_TEX_XY_FILTER_MODE_LINEAR); + GX2InitSampler(&samplerLinear2DClamp, GX2_TEX_CLAMP_MODE_CLAMP, GX2_TEX_XY_FILTER_MODE_LINEAR); + + GX2InitSamplerBorderType(&samplerPoint2DWrap, GX2_TEX_BORDER_TYPE_WHITE); + GX2InitSamplerBorderType(&samplerPoint2DClamp, GX2_TEX_BORDER_TYPE_WHITE); + GX2InitSamplerBorderType(&samplerLinear2DWrap, GX2_TEX_BORDER_TYPE_WHITE); + GX2InitSamplerBorderType(&samplerLinear2DClamp, GX2_TEX_BORDER_TYPE_WHITE); +} diff --git a/GPU/GX2/GX2Util.h b/GPU/GX2/GX2Util.h new file mode 100644 index 000000000000..917c50a8dbc6 --- /dev/null +++ b/GPU/GX2/GX2Util.h @@ -0,0 +1,91 @@ +// Copyright (c) 2017- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. + +#pragma once + +#include +#include +#include +#include +#include + +#define aligndown(x, y) ((x) & ~((y) - 1)) +#define alignup(x, y) aligndown(((x) + ((y) - 1)), y) + +class PushBufferGX2 { +public: + PushBufferGX2(u32 size, u32 align, GX2InvalidateMode mode) : size_(alignup(size, align)), align_(align), mode_(mode) { + buffer_ = (u8 *)MEM2_alloc(size_, align_); + } + PushBufferGX2(PushBufferGX2 &) = delete; + ~PushBufferGX2() { + MEM2_free(buffer_); + } + void *Buf() const { + return buffer_; + } + + // Should be done each frame + void Reset() { + pos_ = 0; + push_size_ = 0; + } + + u8 *BeginPush(u32 *offset, u32 size) { + size = alignup(size, align_); + _assert_(size <= size_); + if (pos_ + push_size_ + size > size_) { + // Wrap! Note that with this method, since we return the same buffer as before, you have to do the draw immediately after. 
+ EndPush(); + pos_ = 0; + } + if(offset) + *offset = pos_; + push_size_ += size; + return (u8 *)buffer_ + pos_ + push_size_ - size; + } + void EndPush() { + if(push_size_) { + GX2Invalidate(mode_, buffer_ + pos_, push_size_); + pos_ += push_size_; + push_size_ = 0; + } + } + +private: + u32 size_; + u32 align_; + GX2InvalidateMode mode_; + u8 *buffer_; + u32 pos_ = 0; + u32 push_size_ = 0; +}; + +class StockGX2 { +public: + static void Init(); + static GX2DepthStencilControlReg depthStencilDisabled; + static GX2DepthStencilControlReg depthDisabledStencilWrite; + static GX2TargetChannelMaskReg TargetChannelMasks[16]; + static GX2StencilMaskReg stencilMask; + static GX2ColorControlReg blendDisabledColorWrite; + static GX2ColorControlReg blendColorDisabled; + static GX2Sampler samplerPoint2DWrap; + static GX2Sampler samplerLinear2DWrap; + static GX2Sampler samplerPoint2DClamp; + static GX2Sampler samplerLinear2DClamp; +}; diff --git a/GPU/GX2/ShaderManagerGX2.cpp b/GPU/GX2/ShaderManagerGX2.cpp new file mode 100644 index 000000000000..289f225ae9a2 --- /dev/null +++ b/GPU/GX2/ShaderManagerGX2.cpp @@ -0,0 +1,343 @@ +// Copyright (c) 2015- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. 
+ +#include "ppsspp_config.h" + +#include +#include +#include +#include + +#include "math/lin/matrix4x4.h" +#include "math/math_util.h" +#include "math/dataconv.h" +#include "thin3d/thin3d.h" +#include "util/text/utf8.h" +#include "Common/Common.h" +#include "Core/Config.h" +#include "Core/Reporting.h" +#include "GPU/Math3D.h" +#include "GPU/GPUState.h" +#include "GPU/ge_constants.h" +#include "GPU/GX2/ShaderManagerGX2.h" +#include "GPU/GX2/FragmentShaderGeneratorGX2.h" +#include "GPU/GX2/VertexShaderGeneratorGX2.h" +#include "GPU/GX2/GX2Util.h" +#include "GPU/Vulkan/FragmentShaderGeneratorVulkan.h" +#include "GPU/Vulkan/VertexShaderGeneratorVulkan.h" + +GX2PShader::GX2PShader(FShaderID id) : GX2PixelShader(), id_(id) { + GenerateFragmentShaderGX2(id, this); + ub_id = (UB_FSID *)MEM2_alloc(sizeof(UB_FSID), GX2_UNIFORM_BLOCK_ALIGNMENT); + memset(ub_id, 0, sizeof(UB_FSID)); + ub_id->FS_BIT_CLEARMODE = id.Bit(FS_BIT_CLEARMODE); + ub_id->FS_BIT_DO_TEXTURE = id.Bit(FS_BIT_DO_TEXTURE); + ub_id->FS_BIT_TEXFUNC = id.Bits(FS_BIT_TEXFUNC, 3); + ub_id->FS_BIT_TEXALPHA = id.Bit(FS_BIT_TEXALPHA); + ub_id->FS_BIT_SHADER_DEPAL = id.Bit(FS_BIT_SHADER_DEPAL); + ub_id->FS_BIT_SHADER_TEX_CLAMP = id.Bit(FS_BIT_SHADER_TEX_CLAMP); + ub_id->FS_BIT_CLAMP_S = id.Bit(FS_BIT_CLAMP_S); + ub_id->FS_BIT_CLAMP_T = id.Bit(FS_BIT_CLAMP_T); + ub_id->FS_BIT_TEXTURE_AT_OFFSET = id.Bit(FS_BIT_TEXTURE_AT_OFFSET); + ub_id->FS_BIT_LMODE = id.Bit(FS_BIT_LMODE); + ub_id->FS_BIT_ALPHA_TEST = id.Bit(FS_BIT_ALPHA_TEST); + ub_id->FS_BIT_ALPHA_TEST_FUNC = id.Bits(FS_BIT_ALPHA_TEST_FUNC, 3); + ub_id->FS_BIT_ALPHA_AGAINST_ZERO = id.Bit(FS_BIT_ALPHA_AGAINST_ZERO); + ub_id->FS_BIT_COLOR_TEST = id.Bit(FS_BIT_COLOR_TEST); + ub_id->FS_BIT_COLOR_TEST_FUNC = id.Bits(FS_BIT_COLOR_TEST_FUNC, 2); + ub_id->FS_BIT_COLOR_AGAINST_ZERO = id.Bit(FS_BIT_COLOR_AGAINST_ZERO); + ub_id->FS_BIT_ENABLE_FOG = id.Bit(FS_BIT_ENABLE_FOG); + ub_id->FS_BIT_DO_TEXTURE_PROJ = id.Bit(FS_BIT_DO_TEXTURE_PROJ); + ub_id->FS_BIT_COLOR_DOUBLE = 
id.Bit(FS_BIT_COLOR_DOUBLE); + ub_id->FS_BIT_STENCIL_TO_ALPHA = id.Bits(FS_BIT_STENCIL_TO_ALPHA, 2); + ub_id->FS_BIT_REPLACE_ALPHA_WITH_STENCIL_TYPE = id.Bits(FS_BIT_REPLACE_ALPHA_WITH_STENCIL_TYPE, 4); + ub_id->FS_BIT_REPLACE_LOGIC_OP_TYPE = id.Bits(FS_BIT_REPLACE_LOGIC_OP_TYPE, 2); + ub_id->FS_BIT_REPLACE_BLEND = id.Bits(FS_BIT_REPLACE_BLEND, 3); + ub_id->FS_BIT_BLENDEQ = id.Bits(FS_BIT_BLENDEQ, 3); + ub_id->FS_BIT_BLENDFUNC_A = id.Bits(FS_BIT_BLENDFUNC_A, 4); + ub_id->FS_BIT_BLENDFUNC_B = id.Bits(FS_BIT_BLENDFUNC_B, 4); + ub_id->FS_BIT_FLATSHADE = id.Bit(FS_BIT_FLATSHADE); + ub_id->FS_BIT_BGRA_TEXTURE = id.Bit(FS_BIT_BGRA_TEXTURE); + ub_id->GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT = gstate_c.Supports(GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT); + ub_id->GPU_SUPPORTS_DEPTH_CLAMP = gstate_c.Supports(GPU_SUPPORTS_DEPTH_CLAMP); + ub_id->GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT = gstate_c.Supports(GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT); + ub_id->GPU_SUPPORTS_ACCURATE_DEPTH = gstate_c.Supports(GPU_SUPPORTS_ACCURATE_DEPTH); + GX2Invalidate(GX2_INVALIDATE_MODE_CPU_UNIFORM_BLOCK, ub_id, sizeof(UB_FSID)); +} + +std::string GX2PShader::GetShaderString(DebugShaderStringType type) const { + switch (type) { + case SHADER_STRING_SHORT_DESC: return FragmentShaderDesc(id_); + case SHADER_STRING_SOURCE_CODE: { + char buffer[0x20000]; + DisassembleGX2Shader(program, size, buffer); + sprintf(buffer + strlen(buffer), "\n### GPU Regs ###\n"); + GX2PixelShaderInfo(this, buffer + strlen(buffer)); + sprintf(buffer + strlen(buffer), "\n### glsl ###\n"); + GenerateVulkanGLSLFragmentShader(id_, buffer + strlen(buffer), 0); + _assert_(strlen(buffer) < sizeof(buffer)); + return buffer; + } + default: return "N/A"; + } +} + +GX2VShader::GX2VShader(VShaderID id) : GX2VertexShader(), id_(id) { + GenerateVertexShaderGX2(id, this); + ub_id = (UB_VSID *)MEM2_alloc(sizeof(UB_VSID), GX2_UNIFORM_BLOCK_ALIGNMENT); + memset(ub_id, 0, sizeof(UB_VSID)); + ub_id->VS_BIT_LMODE = id.Bit(VS_BIT_LMODE); + ub_id->VS_BIT_IS_THROUGH = 
id.Bit(VS_BIT_IS_THROUGH);
+	ub_id->VS_BIT_ENABLE_FOG = id.Bit(VS_BIT_ENABLE_FOG);
+	ub_id->VS_BIT_HAS_COLOR = id.Bit(VS_BIT_HAS_COLOR);
+	ub_id->VS_BIT_DO_TEXTURE = id.Bit(VS_BIT_DO_TEXTURE);
+	ub_id->VS_BIT_DO_TEXTURE_TRANSFORM = id.Bit(VS_BIT_DO_TEXTURE_TRANSFORM);
+	ub_id->VS_BIT_USE_HW_TRANSFORM = id.Bit(VS_BIT_USE_HW_TRANSFORM);
+	ub_id->VS_BIT_HAS_NORMAL = id.Bit(VS_BIT_HAS_NORMAL);
+	ub_id->VS_BIT_NORM_REVERSE = id.Bit(VS_BIT_NORM_REVERSE);
+	ub_id->VS_BIT_HAS_TEXCOORD = id.Bit(VS_BIT_HAS_TEXCOORD);
+	ub_id->VS_BIT_HAS_COLOR_TESS = id.Bit(VS_BIT_HAS_COLOR_TESS);
+	ub_id->VS_BIT_HAS_TEXCOORD_TESS = id.Bit(VS_BIT_HAS_TEXCOORD_TESS);
+	ub_id->VS_BIT_NORM_REVERSE_TESS = id.Bit(VS_BIT_NORM_REVERSE_TESS);
+	// The UV generation mode is a multi-bit field; reading it with Bit() kept only the
+	// lowest bit and collapsed modes 2 and 3 onto 0 and 1.
+	// NOTE(review): width 2 matches upstream ShaderId (SetBits(VS_BIT_UVGEN_MODE, 2, ...)) -
+	// confirm against the ShaderId.h in this tree.
+	ub_id->VS_BIT_UVGEN_MODE = id.Bits(VS_BIT_UVGEN_MODE, 2);
+	ub_id->VS_BIT_UVPROJ_MODE = id.Bits(VS_BIT_UVPROJ_MODE, 2);
+	ub_id->VS_BIT_LS0 = id.Bits(VS_BIT_LS0, 2);
+	ub_id->VS_BIT_LS1 = id.Bits(VS_BIT_LS1, 2);
+	ub_id->VS_BIT_BONES = id.Bits(VS_BIT_BONES, 3);
+	ub_id->VS_BIT_ENABLE_BONES = id.Bit(VS_BIT_ENABLE_BONES);
+	// Per-light component / type selectors.
+	ub_id->VS_BIT_LIGHT[0].COMP = id.Bits(VS_BIT_LIGHT0_COMP, 2);
+	ub_id->VS_BIT_LIGHT[0].TYPE = id.Bits(VS_BIT_LIGHT0_TYPE, 2);
+	ub_id->VS_BIT_LIGHT[1].COMP = id.Bits(VS_BIT_LIGHT1_COMP, 2);
+	ub_id->VS_BIT_LIGHT[1].TYPE = id.Bits(VS_BIT_LIGHT1_TYPE, 2);
+	ub_id->VS_BIT_LIGHT[2].COMP = id.Bits(VS_BIT_LIGHT2_COMP, 2);
+	ub_id->VS_BIT_LIGHT[2].TYPE = id.Bits(VS_BIT_LIGHT2_TYPE, 2);
+	ub_id->VS_BIT_LIGHT[3].COMP = id.Bits(VS_BIT_LIGHT3_COMP, 2);
+	ub_id->VS_BIT_LIGHT[3].TYPE = id.Bits(VS_BIT_LIGHT3_TYPE, 2);
+	ub_id->VS_BIT_MATERIAL_UPDATE = id.Bits(VS_BIT_MATERIAL_UPDATE, 3);
+	ub_id->VS_BIT_SPLINE = id.Bit(VS_BIT_SPLINE);
+	ub_id->VS_BIT_LIGHT[0].ENABLE = id.Bit(VS_BIT_LIGHT0_ENABLE);
+	ub_id->VS_BIT_LIGHT[1].ENABLE = id.Bit(VS_BIT_LIGHT1_ENABLE);
+	ub_id->VS_BIT_LIGHT[2].ENABLE = id.Bit(VS_BIT_LIGHT2_ENABLE);
+	ub_id->VS_BIT_LIGHT[3].ENABLE = id.Bit(VS_BIT_LIGHT3_ENABLE);
+	ub_id->VS_BIT_LIGHTING_ENABLE = id.Bit(VS_BIT_LIGHTING_ENABLE);
+	
ub_id->VS_BIT_WEIGHT_FMTSCALE = id.Bits(VS_BIT_WEIGHT_FMTSCALE, 2); + ub_id->VS_BIT_FLATSHADE = id.Bit(VS_BIT_FLATSHADE); + ub_id->VS_BIT_BEZIER = id.Bit(VS_BIT_BEZIER); + ub_id->GPU_ROUND_DEPTH_TO_16BIT = gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT); + GX2Invalidate(GX2_INVALIDATE_MODE_CPU_UNIFORM_BLOCK, ub_id, sizeof(UB_VSID)); +} + +std::string GX2VShader::GetShaderString(DebugShaderStringType type) const { + switch (type) { + case SHADER_STRING_SHORT_DESC: return VertexShaderDesc(id_); + case SHADER_STRING_SOURCE_CODE: + { + char buffer[0x20000]; + DisassembleGX2Shader(program, size, buffer); + sprintf(buffer + strlen(buffer), "\n### GPU Regs ###\n"); + GX2VertexShaderInfo(this, buffer + strlen(buffer)); + sprintf(buffer + strlen(buffer), "\n### glsl ###\n"); + GenerateVulkanGLSLVertexShader(id_, buffer + strlen(buffer)); + _assert_(strlen(buffer) < sizeof(buffer)); + return buffer; + } + default: return "N/A"; + } +} + +ShaderManagerGX2::ShaderManagerGX2(Draw::DrawContext *draw, GX2ContextState *context) : + ShaderManagerCommon(draw), lastVShader_(nullptr), lastFShader_(nullptr) { + memset(&ub_base, 0, sizeof(ub_base)); + memset(&ub_lights, 0, sizeof(ub_lights)); + memset(&ub_bones, 0, sizeof(ub_bones)); + + INFO_LOG(G3D, "sizeof(ub_base): %d", (int)sizeof(ub_base)); + INFO_LOG(G3D, "sizeof(ub_lights): %d", (int)sizeof(ub_lights)); + INFO_LOG(G3D, "sizeof(ub_bones): %d", (int)sizeof(ub_bones)); +} + +ShaderManagerGX2::~ShaderManagerGX2() { + ClearShaders(); +} + +void ShaderManagerGX2::Clear() { + for (auto iter = fsCache_.begin(); iter != fsCache_.end(); ++iter) { + delete iter->second; + } + for (auto iter = vsCache_.begin(); iter != vsCache_.end(); ++iter) { + delete iter->second; + } + fsCache_.clear(); + vsCache_.clear(); + lastFSID_.set_invalid(); + lastVSID_.set_invalid(); + gstate_c.Dirty(DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE); +} + +void ShaderManagerGX2::ClearShaders() { + Clear(); + DirtyLastShader(); + 
gstate_c.Dirty(DIRTY_ALL_UNIFORMS);
+}
+
+// Forgets the last bound shader pair so the next GetShaders() call does a full lookup.
+void ShaderManagerGX2::DirtyLastShader() {
+	lastFSID_.set_invalid();
+	lastVSID_.set_invalid();
+	lastVShader_ = nullptr;
+	lastFShader_ = nullptr;
+	gstate_c.Dirty(DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE);
+}
+
+// Copies `block` word by word into freshly pushed space and returns the pushed copy.
+// Each word is stored through u32_le, so whatever conversion that type applies on
+// assignment happens here. Trailing bytes beyond a whole u32 are not copied, as before.
+template <typename T>
+static u32_le *PushUniformBlockGX2(PushBufferGX2 *push, const T &block) {
+	u32_le *dst = (u32_le *)push->BeginPush(nullptr, sizeof(T));
+	const u32 *src = (const u32 *)&block;
+	for (size_t i = 0; i < sizeof(T) / sizeof(u32); i++)
+		dst[i] = src[i];
+	push->EndPush();
+	return dst;
+}
+
+// Refreshes the base/light/bone uniform blocks whose dirty bits are set, pushes them
+// into `push` and binds them to the slots named by GX2Gen::UB_Bindings.
+// Returns the dirty mask that was processed; the uniform dirty bits are cleared.
+uint64_t ShaderManagerGX2::UpdateUniforms(PushBufferGX2 *push, bool useBufferedRendering) {
+	uint64_t dirty = gstate_c.GetDirtyUniforms();
+	if (dirty != 0) {
+		if (dirty & DIRTY_BASE_UNIFORMS) {
+			BaseUpdateUniforms(&ub_base, dirty, true, useBufferedRendering);
+			u32_le *push_base = PushUniformBlockGX2(push, ub_base);
+			// The base block is consumed by both shader stages.
+			GX2SetVertexUniformBlock((u32)GX2Gen::UB_Bindings::Base, sizeof(ub_base), push_base);
+			GX2SetPixelUniformBlock((u32)GX2Gen::UB_Bindings::Base, sizeof(ub_base), push_base);
+		}
+		if (dirty & DIRTY_LIGHT_UNIFORMS) {
+			LightUpdateUniforms(&ub_lights, dirty);
+			u32_le *push_lights = PushUniformBlockGX2(push, ub_lights);
+			GX2SetVertexUniformBlock((u32)GX2Gen::UB_Bindings::Lights, sizeof(ub_lights), push_lights);
+		}
+		if (dirty & DIRTY_BONE_UNIFORMS) {
+			BoneUpdateUniforms(&ub_bones, dirty);
+			u32_le *push_bones = PushUniformBlockGX2(push, ub_bones);
+			GX2SetVertexUniformBlock((u32)GX2Gen::UB_Bindings::Bones, sizeof(ub_bones), push_bones);
+		}
+	}
+	gstate_c.CleanUniforms();
+	return dirty;
+}
+
+void ShaderManagerGX2::GetShaders(int prim, u32 vertType, GX2VShader **vshader, GX2PShader **fshader,
+											 bool useHWTransform, bool useHWTessellation) {
+	VShaderID VSID;
+	FShaderID FSID;
+
+	if (gstate_c.IsDirty(DIRTY_VERTEXSHADER_STATE)) {
+		gstate_c.Clean(DIRTY_VERTEXSHADER_STATE);
+		ComputeVertexShaderID(&VSID, vertType, useHWTransform, useHWTessellation);
+	} else {
+		VSID = lastVSID_;
+	}
+
+	if 
(gstate_c.IsDirty(DIRTY_FRAGMENTSHADER_STATE)) { + gstate_c.Clean(DIRTY_FRAGMENTSHADER_STATE); + ComputeFragmentShaderID(&FSID, draw_->GetBugs()); + } else { + FSID = lastFSID_; + } + + // Just update uniforms if this is the same shader as last time. + if (lastVShader_ != nullptr && lastFShader_ != nullptr && VSID == lastVSID_ && FSID == lastFSID_) { + *vshader = lastVShader_; + *fshader = lastFShader_; + // Already all set, no need to look up in shader maps. + return; + } + + VSCache::iterator vsIter = vsCache_.find(VSID); + GX2VShader *vs; + if (vsIter == vsCache_.end()) { + // Vertex shader not in cache. Let's generate it. + vs = new GX2VShader(VSID); + vsCache_[VSID] = vs; + } else { + vs = vsIter->second; + } + lastVSID_ = VSID; + + FSCache::iterator fsIter = fsCache_.find(FSID); + GX2PShader *fs; + if (fsIter == fsCache_.end()) { + // Fragment shader not in cache. Let's generate it. + fs = new GX2PShader(FSID); + fsCache_[FSID] = fs; + } else { + fs = fsIter->second; + } + + lastFSID_ = FSID; + + lastVShader_ = vs; + lastFShader_ = fs; + + *vshader = vs; + *fshader = fs; + + GX2SetVertexUniformBlock(4, sizeof(UB_VSID), vs->ub_id); + GX2SetPixelUniformBlock(5, sizeof(UB_FSID), fs->ub_id); +} + +std::vector ShaderManagerGX2::DebugGetShaderIDs(DebugShaderType type) { + std::string id; + std::vector ids; + switch (type) { + case SHADER_TYPE_VERTEX: { + for (auto iter : vsCache_) { + iter.first.ToString(&id); + ids.push_back(id); + } + break; + } + case SHADER_TYPE_FRAGMENT: { + for (auto iter : fsCache_) { + iter.first.ToString(&id); + ids.push_back(id); + } + break; + } + default: break; + } + return ids; +} + +std::string ShaderManagerGX2::DebugGetShaderString(std::string id, DebugShaderType type, + DebugShaderStringType stringType) { + ShaderID shaderId; + shaderId.FromString(id); + switch (type) { + case SHADER_TYPE_VERTEX: { + auto iter = vsCache_.find(VShaderID(shaderId)); + if (iter == vsCache_.end()) { + return ""; + } + return 
iter->second->GetShaderString(stringType); + } + + case SHADER_TYPE_FRAGMENT: { + auto iter = fsCache_.find(FShaderID(shaderId)); + if (iter == fsCache_.end()) { + return ""; + } + return iter->second->GetShaderString(stringType); + } + default: return "N/A"; + } +} diff --git a/GPU/GX2/ShaderManagerGX2.h b/GPU/GX2/ShaderManagerGX2.h new file mode 100644 index 000000000000..1cfd27434cfb --- /dev/null +++ b/GPU/GX2/ShaderManagerGX2.h @@ -0,0 +1,211 @@ +// Copyright (c) 2017- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. 
+ +#pragma once + +#include +#include +#include +#include + +#include "Common/Swap.h" +#include "base/basictypes.h" +#include "GPU/Common/ShaderCommon.h" +#include "GPU/Common/ShaderId.h" +#include "GPU/Common/ShaderUniforms.h" + +#include "GPU/GX2/GX2Shaders.h" +#include "GPU/GX2/GX2Util.h" + +namespace GX2Gen { + +enum class VSInput : u32 { + POSITION, + COORDS, + COLOR0, + COLOR1, + NORMAL, + WEIGHT0, + WEIGHT1, +}; +enum class PSInput : u32 { + COLOR0, + COLOR1, + FOGDEPTH, + COORDS, +}; + +enum class UB_Bindings : u32 { + Reserved, + Base, + Lights, + Bones, +}; + +} // namespace GX2Gen + +struct UB_VSID { + struct { + u32_le COMP; + u32_le TYPE; + u32_le ENABLE; + u32_le pad_; + } VS_BIT_LIGHT[4]; + u32_le VS_BIT_LMODE; + u32_le VS_BIT_IS_THROUGH; + u32_le VS_BIT_ENABLE_FOG; + u32_le VS_BIT_HAS_COLOR; + u32_le VS_BIT_DO_TEXTURE; + u32_le VS_BIT_DO_TEXTURE_TRANSFORM; + u32_le VS_BIT_USE_HW_TRANSFORM; + u32_le VS_BIT_HAS_NORMAL; + u32_le VS_BIT_NORM_REVERSE; + u32_le VS_BIT_HAS_TEXCOORD; + u32_le VS_BIT_HAS_COLOR_TESS; + u32_le VS_BIT_HAS_TEXCOORD_TESS; + u32_le VS_BIT_NORM_REVERSE_TESS; + u32_le VS_BIT_UVGEN_MODE; + u32_le VS_BIT_UVPROJ_MODE; + u32_le VS_BIT_LS0; + u32_le VS_BIT_LS1; + u32_le VS_BIT_BONES; + u32_le VS_BIT_ENABLE_BONES; + u32_le VS_BIT_MATERIAL_UPDATE; + u32_le VS_BIT_SPLINE; + u32_le VS_BIT_LIGHTING_ENABLE; + u32_le VS_BIT_WEIGHT_FMTSCALE; + u32_le VS_BIT_FLATSHADE; + u32_le VS_BIT_BEZIER; + u32_le GPU_ROUND_DEPTH_TO_16BIT; +} __attribute__((aligned(64))); + +struct UB_FSID { + u32_le FS_BIT_CLEARMODE; + u32_le FS_BIT_DO_TEXTURE; + u32_le FS_BIT_TEXFUNC; + u32_le FS_BIT_TEXALPHA; + u32_le FS_BIT_SHADER_DEPAL; + u32_le FS_BIT_SHADER_TEX_CLAMP; + u32_le FS_BIT_CLAMP_S; + u32_le FS_BIT_CLAMP_T; + u32_le FS_BIT_TEXTURE_AT_OFFSET; + u32_le FS_BIT_LMODE; + u32_le FS_BIT_ALPHA_TEST; + u32_le FS_BIT_ALPHA_TEST_FUNC; + u32_le FS_BIT_ALPHA_AGAINST_ZERO; + u32_le FS_BIT_COLOR_TEST; + u32_le FS_BIT_COLOR_TEST_FUNC; + u32_le FS_BIT_COLOR_AGAINST_ZERO; + 
u32_le FS_BIT_ENABLE_FOG; + u32_le FS_BIT_DO_TEXTURE_PROJ; + u32_le FS_BIT_COLOR_DOUBLE; + u32_le FS_BIT_STENCIL_TO_ALPHA; + u32_le FS_BIT_REPLACE_ALPHA_WITH_STENCIL_TYPE; + u32_le FS_BIT_REPLACE_LOGIC_OP_TYPE; + u32_le FS_BIT_REPLACE_BLEND; + u32_le FS_BIT_BLENDEQ; + u32_le FS_BIT_BLENDFUNC_A; + u32_le FS_BIT_BLENDFUNC_B; + u32_le FS_BIT_FLATSHADE; + u32_le FS_BIT_BGRA_TEXTURE; + u32_le GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT; + u32_le GPU_SUPPORTS_DEPTH_CLAMP; + u32_le GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT; + u32_le GPU_SUPPORTS_ACCURATE_DEPTH; +} __attribute__((aligned(64))); + +class GX2PShader : public GX2PixelShader { +public: + GX2PShader(FShaderID id); + ~GX2PShader() { + MEM2_free(ub_id); + if (!(gx2rBuffer.flags & GX2R_RESOURCE_LOCKED_READ_ONLY)) + MEM2_free(program); + } + + const std::string source() const { return "N/A"; } + const u8 *bytecode() const { return program; } + std::string GetShaderString(DebugShaderStringType type) const; + + UB_FSID *ub_id; + +protected: + FShaderID id_; +}; + +class GX2VShader : public GX2VertexShader { +public: + GX2VShader(VShaderID id); + ~GX2VShader() { + MEM2_free(ub_id); + if (!(gx2rBuffer.flags & GX2R_RESOURCE_LOCKED_READ_ONLY)) + MEM2_free(program); + } + + const std::string source() const { return "N/A"; } + const u8 *bytecode() const { return program; } + std::string GetShaderString(DebugShaderStringType type) const; + + UB_VSID *ub_id; + +protected: + VShaderID id_; +}; + +class ShaderManagerGX2 : public ShaderManagerCommon { +public: + ShaderManagerGX2(Draw::DrawContext *draw, GX2ContextState *context); + ~ShaderManagerGX2(); + + void GetShaders(int prim, u32 vertType, GX2VShader **vshader, GX2PShader **fshader, bool useHWTransform, bool useHWTessellation); + void ClearShaders(); + void DirtyLastShader() override; + + int GetNumVertexShaders() const { return (int)vsCache_.size(); } + int GetNumFragmentShaders() const { return (int)fsCache_.size(); } + + std::vector DebugGetShaderIDs(DebugShaderType type); + 
std::string DebugGetShaderString(std::string id, DebugShaderType type, DebugShaderStringType stringType); + + uint64_t UpdateUniforms(PushBufferGX2 *push, bool useBufferedRendering); + + // TODO: Avoid copying these buffers if same as last draw, can still point to it assuming we're still in the same pushbuffer. + // Applies dirty changes and copies the buffer. + bool IsBaseDirty() { return true; } + bool IsLightDirty() { return true; } + bool IsBoneDirty() { return true; } + +private: + void Clear(); + + typedef std::map FSCache; + FSCache fsCache_; + + typedef std::map VSCache; + VSCache vsCache_; + + // Uniform block scratchpad. These (the relevant ones) are copied to the current pushbuffer at draw time. + UB_VS_FS_Base ub_base; + UB_VS_Lights ub_lights; + UB_VS_Bones ub_bones; + + GX2PShader *lastFShader_; + GX2VShader *lastVShader_; + + FShaderID lastFSID_; + VShaderID lastVSID_; +}; diff --git a/GPU/GX2/StateMappingGX2.cpp b/GPU/GX2/StateMappingGX2.cpp new file mode 100644 index 000000000000..1148cba8ae02 --- /dev/null +++ b/GPU/GX2/StateMappingGX2.cpp @@ -0,0 +1,434 @@ +// Copyright (c) 2012- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. 
+ +#include "math/dataconv.h" + +#include "GPU/Math3D.h" +#include "GPU/GPUState.h" +#include "GPU/ge_constants.h" +#include "GPU/Common/GPUStateUtils.h" +#include "Core/System.h" +#include "Core/Config.h" +#include "Core/Reporting.h" + +#include "profiler/profiler.h" +#include "GPU/Common/FramebufferManagerCommon.h" +#include "GPU/GX2/DrawEngineGX2.h" +#include "GPU/GX2/StateMappingGX2.h" +#include "GPU/GX2/FramebufferManagerGX2.h" +#include "GPU/GX2/TextureCacheGX2.h" + +#include + +// clang-format off +// These tables all fit into u8s. +static const GX2BlendMode GX2BlendFactorLookup[(size_t)BlendFactor::COUNT] = { + GX2_BLEND_MODE_ZERO, + GX2_BLEND_MODE_ONE, + GX2_BLEND_MODE_SRC_COLOR, + GX2_BLEND_MODE_INV_SRC_COLOR, + GX2_BLEND_MODE_DST_COLOR, + GX2_BLEND_MODE_INV_DST_COLOR, + GX2_BLEND_MODE_SRC_ALPHA, + GX2_BLEND_MODE_INV_SRC_ALPHA, + GX2_BLEND_MODE_DST_ALPHA, + GX2_BLEND_MODE_INV_DST_ALPHA, + GX2_BLEND_MODE_BLEND_FACTOR, + GX2_BLEND_MODE_INV_BLEND_FACTOR, + GX2_BLEND_MODE_BLEND_ALPHA, + GX2_BLEND_MODE_INV_BLEND_ALPHA, + GX2_BLEND_MODE_SRC1_COLOR, + GX2_BLEND_MODE_INV_SRC1_COLOR, + GX2_BLEND_MODE_SRC1_ALPHA, + GX2_BLEND_MODE_INV_SRC1_ALPHA, +}; + +static const GX2BlendCombineMode GX2BlendEqLookup[(size_t)BlendEq::COUNT] = { + GX2_BLEND_COMBINE_MODE_ADD, + GX2_BLEND_COMBINE_MODE_SUB, + GX2_BLEND_COMBINE_MODE_REV_SUB, + GX2_BLEND_COMBINE_MODE_MIN, + GX2_BLEND_COMBINE_MODE_MAX, +}; + +static const GX2CompareFunction compareOps[] = { + GX2_COMPARE_FUNC_NEVER, + GX2_COMPARE_FUNC_ALWAYS, + GX2_COMPARE_FUNC_EQUAL, + GX2_COMPARE_FUNC_NOT_EQUAL, + GX2_COMPARE_FUNC_LESS, + GX2_COMPARE_FUNC_LEQUAL, + GX2_COMPARE_FUNC_GREATER, + GX2_COMPARE_FUNC_GEQUAL, +}; + +static const GX2StencilFunction stencilOps[] = { + GX2_STENCIL_FUNCTION_KEEP, + GX2_STENCIL_FUNCTION_ZERO, + GX2_STENCIL_FUNCTION_REPLACE, + GX2_STENCIL_FUNCTION_INV, + GX2_STENCIL_FUNCTION_INCR_CLAMP, + GX2_STENCIL_FUNCTION_DECR_CLAMP, + GX2_STENCIL_FUNCTION_KEEP, // reserved + GX2_STENCIL_FUNCTION_KEEP, // 
reserved +}; + +static const GX2PrimitiveMode primToGX2[8] = { + GX2_PRIMITIVE_MODE_POINTS, + GX2_PRIMITIVE_MODE_LINES, + GX2_PRIMITIVE_MODE_LINE_STRIP, + GX2_PRIMITIVE_MODE_TRIANGLES, + GX2_PRIMITIVE_MODE_TRIANGLE_STRIP, + GX2_PRIMITIVE_MODE_TRIANGLE_FAN, + GX2_PRIMITIVE_MODE_TRIANGLES, +}; + +static const GX2LogicOp logicOps[] = { + GX2_LOGIC_OP_CLEAR, + GX2_LOGIC_OP_AND, + GX2_LOGIC_OP_REV_AND, + GX2_LOGIC_OP_COPY, + GX2_LOGIC_OP_INV_AND, + GX2_LOGIC_OP_NOP, + GX2_LOGIC_OP_XOR, + GX2_LOGIC_OP_OR, + GX2_LOGIC_OP_NOR, + GX2_LOGIC_OP_EQUIV, + GX2_LOGIC_OP_INV, + GX2_LOGIC_OP_REV_OR, + GX2_LOGIC_OP_INV_COPY, + GX2_LOGIC_OP_INV_OR, + GX2_LOGIC_OP_NOT_AND, + GX2_LOGIC_OP_SET, +}; +// clang-format on + +void DrawEngineGX2::ResetShaderBlending() { + if (fboTexBound_) { + // GX2SetPixelTexture(nullptr, 0); + fboTexBound_ = false; + } +} + +class FramebufferManagerGX2; +class ShaderManagerGX2; + +void DrawEngineGX2::ApplyDrawState(int prim) { + PROFILE_THIS_SCOPE("drawState"); + dynState_.topology = primToGX2[prim]; + + if (!gstate_c.IsDirty(DIRTY_BLEND_STATE | DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_RASTER_STATE | DIRTY_DEPTHSTENCIL_STATE)) { + // nothing to do + return; + } + + bool useBufferedRendering = framebufferManager_->UseBufferedRendering(); + // Blend + if (gstate_c.IsDirty(DIRTY_BLEND_STATE)) { + gstate_c.SetAllowShaderBlend(!g_Config.bDisableSlowFramebufEffects); + if (gstate.isModeClear()) { + keys_.blend.value = 0; // full wipe + keys_.blend.blendEnable = false; + dynState_.useBlendColor = false; + // Color Test + bool alphaMask = gstate.isClearModeAlphaMask(); + bool colorMask = gstate.isClearModeColorMask(); + keys_.blend.colorWriteMask = (GX2ChannelMask)((colorMask ? (1 | 2 | 4) : 0) | (alphaMask ? 8 : 0)); + } else { + keys_.blend.value = 0; + // Set blend - unless we need to do it in the shader. 
+ GenericBlendState blendState; + ConvertBlendState(blendState, gstate_c.allowShaderBlend); + if (blendState.applyShaderBlending) { + if (ApplyShaderBlending()) { + // We may still want to do something about stencil -> alpha. + ApplyStencilReplaceAndLogicOp(blendState.replaceAlphaWithStencil, blendState); + } else { + // Until next time, force it off. + ResetShaderBlending(); + gstate_c.SetAllowShaderBlend(false); + } + } else if (blendState.resetShaderBlending) { + ResetShaderBlending(); + } + + if (blendState.enabled) { + keys_.blend.blendEnable = true; + keys_.blend.logicOpEnable = false; + keys_.blend.blendOpColor = GX2BlendEqLookup[(size_t)blendState.eqColor]; + keys_.blend.blendOpAlpha = GX2BlendEqLookup[(size_t)blendState.eqAlpha]; + keys_.blend.srcColor = GX2BlendFactorLookup[(size_t)blendState.srcColor]; + keys_.blend.srcAlpha = GX2BlendFactorLookup[(size_t)blendState.srcAlpha]; + keys_.blend.destColor = GX2BlendFactorLookup[(size_t)blendState.dstColor]; + keys_.blend.destAlpha = GX2BlendFactorLookup[(size_t)blendState.dstAlpha]; + if (blendState.dirtyShaderBlend) { + gstate_c.Dirty(DIRTY_SHADERBLEND); + } + dynState_.useBlendColor = blendState.useBlendColor; + if (blendState.useBlendColor) { + dynState_.blendColor = blendState.blendColor; + } + } else { + keys_.blend.blendEnable = false; + dynState_.useBlendColor = false; + } + + if (gstate_c.Supports(GPU_SUPPORTS_LOGIC_OP)) { + // Logic Ops + if (gstate.isLogicOpEnabled() && gstate.getLogicOp() != GE_LOGIC_COPY) { + keys_.blend.blendEnable = false; // Can't have both blend & logic op - although I think the PSP can! + keys_.blend.logicOpEnable = true; + keys_.blend.logicOp = logicOps[gstate.getLogicOp()]; + } else { + keys_.blend.logicOpEnable = false; + } + } + + // PSP color/alpha mask is per bit but we can only support per byte. + // But let's do that, at least. And let's try a threshold. 
+ bool rmask = (gstate.pmskc & 0xFF) < 128; + bool gmask = ((gstate.pmskc >> 8) & 0xFF) < 128; + bool bmask = ((gstate.pmskc >> 16) & 0xFF) < 128; + bool amask = (gstate.pmska & 0xFF) < 128; + +#ifndef MOBILE_DEVICE + u8 abits = (gstate.pmska >> 0) & 0xFF; + u8 rbits = (gstate.pmskc >> 0) & 0xFF; + u8 gbits = (gstate.pmskc >> 8) & 0xFF; + u8 bbits = (gstate.pmskc >> 16) & 0xFF; + if ((rbits != 0 && rbits != 0xFF) || (gbits != 0 && gbits != 0xFF) || (bbits != 0 && bbits != 0xFF)) { + WARN_LOG_REPORT_ONCE(rgbmask, G3D, "Unsupported RGB mask: r=%02x g=%02x b=%02x", rbits, gbits, bbits); + } + if (abits != 0 && abits != 0xFF) { + // The stencil part of the mask is supported. + WARN_LOG_REPORT_ONCE(amask, G3D, "Unsupported alpha/stencil mask: %02x", abits); + } +#endif + + // Let's not write to alpha if stencil isn't enabled. + if (IsStencilTestOutputDisabled()) { + amask = false; + } else { + // If the stencil type is set to KEEP, we shouldn't write to the stencil/alpha channel. + if (ReplaceAlphaWithStencilType() == STENCIL_VALUE_KEEP) { + amask = false; + } + } + + keys_.blend.colorWriteMask = (GX2ChannelMask)((rmask ? 1 : 0) | (gmask ? 2 : 0) | (bmask ? 4 : 0) | (amask ? 8 : 0)); + } + + GX2BlendState *bs1 = blendCache_.Get(keys_.blend.value); + if (bs1 == nullptr) { + bs1 = new GX2BlendState; + GX2InitColorControlReg(&bs1->color, keys_.blend.logicOpEnable ? keys_.blend.logicOp : GX2_LOGIC_OP_COPY, keys_.blend.blendEnable ? 
0xFF : 0x00, false, keys_.blend.colorWriteMask != 0); + GX2InitTargetChannelMasksReg(&bs1->mask, keys_.blend.colorWriteMask, (GX2ChannelMask)0, (GX2ChannelMask)0, (GX2ChannelMask)0, (GX2ChannelMask)0, (GX2ChannelMask)0, (GX2ChannelMask)0, (GX2ChannelMask)0); + GX2InitBlendControlReg(&bs1->blend, GX2_RENDER_TARGET_0, keys_.blend.srcColor, keys_.blend.destColor, keys_.blend.blendOpColor, keys_.blend.srcAlpha && keys_.blend.destAlpha, keys_.blend.srcAlpha, keys_.blend.destAlpha, keys_.blend.blendOpAlpha); + blendCache_.Insert(keys_.blend.value, bs1); + } + blendState_ = bs1; + } + + if (gstate_c.IsDirty(DIRTY_RASTER_STATE)) { + keys_.raster.value = 0; + keys_.raster.frontFace = GX2_FRONT_FACE_CCW; + // Set cull + if (!gstate.isModeClear() && !gstate.isModeThrough() && prim != GE_PRIM_RECTANGLES && gstate.isCullEnabled()) { + keys_.raster.cullFront = !!gstate.getCullMode(); + keys_.raster.cullBack = !gstate.getCullMode(); + } else { + keys_.raster.cullFront = GX2_DISABLE; + keys_.raster.cullBack = GX2_DISABLE; + } + GX2RasterizerState *rs = rasterCache_.Get(keys_.raster.value); + if (rs == nullptr) { + rs = new GX2RasterizerState({ keys_.raster.frontFace, keys_.raster.cullFront, keys_.raster.cullBack }); + rasterCache_.Insert(keys_.raster.value, rs); + } + rasterState_ = rs; + } + + if (gstate_c.IsDirty(DIRTY_DEPTHSTENCIL_STATE)) { + if (gstate.isModeClear()) { + keys_.depthStencil.value = 0; + keys_.depthStencil.depthTestEnable = true; + keys_.depthStencil.depthCompareOp = GX2_COMPARE_FUNC_ALWAYS; + keys_.depthStencil.depthWriteEnable = gstate.isClearModeDepthMask(); + if (gstate.isClearModeDepthMask()) { + framebufferManager_->SetDepthUpdated(); + } + + // Stencil Test + bool alphaMask = gstate.isClearModeAlphaMask(); + if (alphaMask) { + keys_.depthStencil.stencilTestEnable = true; + keys_.depthStencil.stencilCompareFunc = GX2_COMPARE_FUNC_ALWAYS; + keys_.depthStencil.stencilPassOp = GX2_STENCIL_FUNCTION_REPLACE; + keys_.depthStencil.stencilFailOp = 
GX2_STENCIL_FUNCTION_REPLACE; + keys_.depthStencil.stencilDepthFailOp = GX2_STENCIL_FUNCTION_REPLACE; + dynState_.useStencil = true; + // In clear mode, the stencil value is set to the alpha value of the vertex. + // A normal clear will be 2 points, the second point has the color. + // We override this value in the pipeline from software transform for clear rectangles. + dynState_.stencilRef = 0xFF; + // But we still apply the stencil write mask. + keys_.depthStencil.stencilWriteMask = (~gstate.getStencilWriteMask()) & 0xFF; + } else { + keys_.depthStencil.stencilTestEnable = false; + dynState_.useStencil = false; + } + + } else { + keys_.depthStencil.value = 0; + // Depth Test + if (gstate.isDepthTestEnabled()) { + keys_.depthStencil.depthTestEnable = true; + keys_.depthStencil.depthCompareOp = compareOps[gstate.getDepthTestFunction()]; + keys_.depthStencil.depthWriteEnable = gstate.isDepthWriteEnabled(); + if (gstate.isDepthWriteEnabled()) { + framebufferManager_->SetDepthUpdated(); + } + } else { + keys_.depthStencil.depthTestEnable = false; + keys_.depthStencil.depthWriteEnable = false; + keys_.depthStencil.depthCompareOp = GX2_COMPARE_FUNC_ALWAYS; + } + + GenericStencilFuncState stencilState; + ConvertStencilFuncState(stencilState); + + // Stencil Test + if (stencilState.enabled) { + keys_.depthStencil.stencilTestEnable = true; + keys_.depthStencil.stencilCompareFunc = compareOps[stencilState.testFunc]; + keys_.depthStencil.stencilPassOp = stencilOps[stencilState.zPass]; + keys_.depthStencil.stencilFailOp = stencilOps[stencilState.sFail]; + keys_.depthStencil.stencilDepthFailOp = stencilOps[stencilState.zFail]; + keys_.depthStencil.stencilCompareMask = stencilState.testMask; + keys_.depthStencil.stencilWriteMask = stencilState.writeMask; + dynState_.useStencil = true; + dynState_.stencilRef = stencilState.testRef; + } else { + keys_.depthStencil.stencilTestEnable = false; + dynState_.useStencil = false; + } + } + GX2DepthStencilControlReg *ds = 
depthStencilCache_.Get(keys_.depthStencil.value); + if (ds == nullptr) { + ds = new GX2DepthStencilControlReg; + GX2InitDepthStencilControlReg(ds, keys_.depthStencil.depthTestEnable, keys_.depthStencil.depthWriteEnable, keys_.depthStencil.depthCompareOp, keys_.depthStencil.stencilTestEnable, keys_.depthStencil.stencilTestEnable, keys_.depthStencil.stencilCompareFunc, keys_.depthStencil.stencilPassOp, keys_.depthStencil.stencilDepthFailOp, keys_.depthStencil.stencilFailOp, keys_.depthStencil.stencilCompareFunc, keys_.depthStencil.stencilPassOp, keys_.depthStencil.stencilDepthFailOp, keys_.depthStencil.stencilFailOp); + depthStencilCache_.Insert(keys_.depthStencil.value, ds); + } + depthStencilState_ = ds; + } + + if (gstate_c.IsDirty(DIRTY_VIEWPORTSCISSOR_STATE)) { + ViewportAndScissor vpAndScissor; + ConvertViewportAndScissor(useBufferedRendering, framebufferManager_->GetRenderWidth(), framebufferManager_->GetRenderHeight(), framebufferManager_->GetTargetBufferWidth(), framebufferManager_->GetTargetBufferHeight(), vpAndScissor); + + float depthMin = vpAndScissor.depthRangeMin; + float depthMax = vpAndScissor.depthRangeMax; + + if (depthMin < 0.0f) + depthMin = 0.0f; + if (depthMax > 1.0f) + depthMax = 1.0f; + if (vpAndScissor.dirtyDepth) { + gstate_c.Dirty(DIRTY_DEPTHRANGE); + } + + Draw::Viewport &vp = dynState_.viewport; + vp.TopLeftX = vpAndScissor.viewportX; + vp.TopLeftY = vpAndScissor.viewportY; + vp.Width = vpAndScissor.viewportW; + vp.Height = vpAndScissor.viewportH; + vp.MinDepth = depthMin; + vp.MaxDepth = depthMax; + + if (vpAndScissor.dirtyProj) { + gstate_c.Dirty(DIRTY_PROJMATRIX); + } + + GX2_RECT &scissor = dynState_.scissor; + if (vpAndScissor.scissorEnable) { + scissor.left = vpAndScissor.scissorX; + scissor.top = vpAndScissor.scissorY; + scissor.right = vpAndScissor.scissorX + std::max(0, vpAndScissor.scissorW); + scissor.bottom = vpAndScissor.scissorY + std::max(0, vpAndScissor.scissorH); + } else { + scissor.left = 0; + scissor.top = 0; + 
scissor.right = framebufferManager_->GetRenderWidth(); + scissor.bottom = framebufferManager_->GetRenderHeight(); + } + } + + if (gstate_c.IsDirty(DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS) && !gstate.isModeClear() && gstate.isTextureMapEnabled()) { + textureCache_->SetTexture(); + gstate_c.Clean(DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS); + } else if (gstate.getTextureAddress(0) == ((gstate.getFrameBufRawAddress() | 0x04000000) & 0x3FFFFFFF)) { + // This catches the case of clearing a texture. + gstate_c.Dirty(DIRTY_TEXTURE_IMAGE); + } +} + +void DrawEngineGX2::ApplyDrawStateLate(bool applyStencilRef, uint8_t stencilRef) { + PROFILE_THIS_SCOPE("late drawState"); + if (!gstate.isModeClear()) { + if (fboTexNeedBind_) { + framebufferManager_->BindFramebufferAsColorTexture(1, framebufferManager_->GetCurrentRenderVFB(), BINDFBCOLOR_MAY_COPY); + // No sampler required, we do a plain Load in the pixel shader. + fboTexBound_ = true; + fboTexNeedBind_ = false; + } + textureCache_->ApplyTexture(); + } + + // we go through Draw here because it automatically handles screen rotation, as needed in UWP on mobiles. 
+ if (gstate_c.IsDirty(DIRTY_VIEWPORTSCISSOR_STATE)) { + GX2ColorBuffer *current_color_buffer = (GX2ColorBuffer *)draw_->GetNativeObject(Draw::NativeObject::BACKBUFFER_COLOR_VIEW); + draw_->SetViewports(1, &dynState_.viewport); + int left = std::min(std::max(0, dynState_.scissor.left), (int)current_color_buffer->surface.width - 1); + int top = std::min(std::max(0, dynState_.scissor.top), (int)current_color_buffer->surface.height - 1); + int width = std::min(dynState_.scissor.right - dynState_.scissor.left, (int)current_color_buffer->surface.width - left); + int height = std::min(dynState_.scissor.bottom - dynState_.scissor.top, (int)current_color_buffer->surface.height - top); + draw_->SetScissorRect(left, top, width, height); + } + if (gstate_c.IsDirty(DIRTY_RASTER_STATE)) { + GX2SetCullOnlyControl(rasterState_->frontFace_, rasterState_->cullFront_, rasterState_->cullBack_); + } + if (gstate_c.IsDirty(DIRTY_BLEND_STATE)) { + // Need to do this AFTER ApplyTexture because the process of depallettization can ruin the blend state. + float blendColor[4]; + Uint8x4ToFloat4(blendColor, dynState_.blendColor); + GX2SetBlendControlReg(&blendState_->blend); + GX2SetColorControlReg(&blendState_->color); + GX2SetTargetChannelMasksReg(&blendState_->mask); + GX2SetBlendConstantColorReg((GX2BlendConstantColorReg *)blendColor); + } + if (gstate_c.IsDirty(DIRTY_DEPTHSTENCIL_STATE) || applyStencilRef) { + GX2SetDepthStencilControlReg(depthStencilState_); + if (!applyStencilRef) + stencilRef = dynState_.stencilRef; + GX2SetStencilMask(0xFF, 0xFF, stencilRef, 0xFF, 0xFF, stencilRef); + } + gstate_c.Clean(DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_BLEND_STATE); + + // Must dirty blend state here so we re-copy next time. Example: Lunar's spell effects. 
+ if (fboTexBound_) + gstate_c.Dirty(DIRTY_BLEND_STATE); +} diff --git a/GPU/GX2/StateMappingGX2.h b/GPU/GX2/StateMappingGX2.h new file mode 100644 index 000000000000..0af6b4febc2a --- /dev/null +++ b/GPU/GX2/StateMappingGX2.h @@ -0,0 +1,81 @@ +#pragma once + +#include +#include + +#include "thin3d/thin3d.h" + +// TODO: Do this more progressively. No need to compute the entire state if the entire state hasn't changed. + +struct GX2BlendKey { + union { + uint64_t value; + struct { + // Blend + bool blendEnable : 1; + GX2BlendMode srcColor : 5; + GX2BlendMode destColor : 5; + GX2BlendMode srcAlpha : 5; + GX2BlendMode destAlpha : 5; + GX2BlendCombineMode blendOpColor : 3; + GX2BlendCombineMode blendOpAlpha : 3; + bool logicOpEnable : 1; + GX2LogicOp logicOp : 8; + GX2ChannelMask colorWriteMask : 4; + }; + }; +}; + +struct GX2DepthStencilKey { + union { + uint64_t value; + struct { + // Depth/Stencil + bool depthTestEnable : 1; + bool depthWriteEnable : 1; + GX2CompareFunction depthCompareOp : 4; // GX2_COMPARISON (-1 and we could fit it in 3 bits) + bool stencilTestEnable : 1; + GX2CompareFunction stencilCompareFunc : 4; // GX2_COMPARISON + GX2StencilFunction stencilPassOp : 4; // GX2_STENCIL_OP + GX2StencilFunction stencilFailOp : 4; // GX2_STENCIL_OP + GX2StencilFunction stencilDepthFailOp : 4; // GX2_STENCIL_OP + unsigned int stencilWriteMask : 8; // Unfortunately these are baked into the state on GX2 + unsigned int stencilCompareMask : 8; + }; + }; +}; + +struct GX2RasterKey { + union { + uint32_t value; + struct { + GX2FrontFace frontFace : 1; + bool cullFront : 1; + bool cullBack : 1; + }; + }; +}; + +// In GX2 we cache blend state objects etc, and we simply emit keys, which are then also used to create these objects. 
// The three cache keys emitted per draw: one each for the blend, depth/stencil and raster state.
struct GX2StateKeys {
	GX2BlendKey blend;
	GX2DepthStencilKey depthStencil;
	GX2RasterKey raster;
};

// Integer rectangle described by its edges. DrawEngineGX2::ApplyDrawState stores
// right/bottom as edge coordinates (x + w, y + h), and ApplyDrawStateLate derives the
// width/height from them again.
struct GX2_RECT {
	int left;
	int top;
	int right;
	int bottom;
};

// State that is applied directly at draw time instead of being looked up through a cached key.
struct GX2DynamicState {
	int topology;           // Set from primToGX2[prim] in ApplyDrawState.
	bool useBlendColor;     // Copied from the converted blend state; false in clear mode.
	uint32_t blendColor;    // Packed 8-bit-per-channel color; expanded with Uint8x4ToFloat4 before use.
	bool useStencil;        // True whenever the stencil test is enabled for the draw.
	uint8_t stencilRef;     // Stencil reference value (0xFF in clear mode).
	Draw::Viewport viewport;
	GX2_RECT scissor;
};
diff --git a/GPU/GX2/StencilBufferGX2.cpp b/GPU/GX2/StencilBufferGX2.cpp new file mode 100644 index 000000000000..7edd4f7b4144 --- /dev/null +++ b/GPU/GX2/StencilBufferGX2.cpp @@ -0,0 +1,209 @@ +// Copyright (c) 2014- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+ +#include + +#include "ext/native/thin3d/thin3d.h" +#include "Core/Reporting.h" +#include "GPU/Common/StencilCommon.h" +#include "GPU/GX2/FramebufferManagerGX2.h" +#include "GPU/GX2/FragmentShaderGeneratorGX2.h" +#include "GPU/GX2/ShaderManagerGX2.h" +#include "GPU/GX2/TextureCacheGX2.h" +#include "GPU/GX2/GX2Util.h" +#include "GPU/GX2/GX2Shaders.h" + +static const char *stencil_ps = R"( +SamplerState samp : register(s0); +Texture2D tex : register(t0); +cbuffer base : register(b0) { + int4 u_stencilValue; +}; +struct PS_IN { + float2 v_texcoord0 : TEXCOORD0; +}; +float4 main(PS_IN In) : SV_Target { + float4 index = tex.Sample(samp, In.v_texcoord0); + int indexBits = int(index.a * 255.99); + if ((indexBits & u_stencilValue.x) == 0) + discard; + return index.aaaa; +} +)"; + +// static const char *stencil_ps_fast; + +static const char *stencil_vs = R"( +struct VS_IN { + float4 a_position : POSITION; + float2 a_texcoord0 : TEXCOORD0; +}; +struct VS_OUT { + float2 v_texcoord0 : TEXCOORD0; + float4 position : SV_Position; +}; +VS_OUT main(VS_IN In) { + VS_OUT Out; + Out.position = In.a_position; + Out.v_texcoord0 = In.a_texcoord0; + return Out; +} +)"; + +// TODO : If SV_StencilRef is available (?) then this can be done in a single pass. +bool FramebufferManagerGX2::NotifyStencilUpload(u32 addr, int size, StencilUpload flags) { + if (!MayIntersectFramebuffer(addr)) { + return false; + } + + VirtualFramebuffer *dstBuffer = 0; + for (size_t i = 0; i < vfbs_.size(); ++i) { + VirtualFramebuffer *vfb = vfbs_[i]; + if (vfb->fb_address == addr) { + dstBuffer = vfb; + } + } + if (!dstBuffer) { + return false; + } + + int values = 0; + u8 usedBits = 0; + + const u8 *src = Memory::GetPointer(addr); + if (!src) { + return false; + } + + switch (dstBuffer->format) { + case GE_FORMAT_565: + // Well, this doesn't make much sense. 
+ return false; + case GE_FORMAT_5551: + usedBits = StencilBits5551(src, dstBuffer->fb_stride * dstBuffer->bufferHeight); + values = 2; + break; + case GE_FORMAT_4444: + usedBits = StencilBits4444(src, dstBuffer->fb_stride * dstBuffer->bufferHeight); + values = 16; + break; + case GE_FORMAT_8888: + usedBits = StencilBits8888(src, dstBuffer->fb_stride * dstBuffer->bufferHeight); + values = 256; + break; + case GE_FORMAT_INVALID: + // Impossible. + break; + } + + if (usedBits == 0) { + if (flags == StencilUpload::STENCIL_IS_ZERO) { + // Common when creating buffers, it's already 0. We're done. + return false; + } + + // Clear stencil+alpha but not color. Only way is to draw a quad. + GX2SetColorControlReg(&StockGX2::blendDisabledColorWrite); + GX2SetTargetChannelMasksReg(&StockGX2::TargetChannelMasks[0x8]); + GX2SetCullOnlyControl(GX2_FRONT_FACE_CCW, GX2_DISABLE, GX2_DISABLE); + GX2SetDepthStencilControlReg(&StockGX2::depthDisabledStencilWrite); + GX2SetAttribBuffer(0, 4 * quadStride_, quadStride_, fsQuadBuffer_); + GX2DrawEx(GX2_PRIMITIVE_MODE_TRIANGLE_STRIP, 4, 0, 1); + + gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE); + return true; + } + + if (!stencilValueBuffer_) { + static_assert(!(sizeof(StencilValueUB) & 0x3F), "sizeof(StencilValueUB) must to be aligned to 64bytes!"); + stencilValueBuffer_ = (StencilValueUB *)MEM2_alloc(sizeof(StencilValueUB), GX2_UNIFORM_BLOCK_ALIGNMENT); + memset(stencilValueBuffer_, 0, sizeof(StencilValueUB)); + GX2Invalidate(GX2_INVALIDATE_MODE_CPU_UNIFORM_BLOCK, stencilValueBuffer_, sizeof(StencilValueUB)); + } + + shaderManagerGX2_->DirtyLastShader(); + + u16 w = dstBuffer->renderWidth; + u16 h = dstBuffer->renderHeight; + float u1 = 1.0f; + float v1 = 1.0f; + Draw::Texture *tex = MakePixelTexture(src, dstBuffer->format, dstBuffer->fb_stride, dstBuffer->bufferWidth, dstBuffer->bufferHeight, u1, v1); + if (!tex) + return false; + if (dstBuffer->fbo) { + // Typically, STENCIL_IS_ZERO means it's 
already bound. + Draw::RPAction stencilAction = flags == StencilUpload::STENCIL_IS_ZERO ? Draw::RPAction::KEEP : Draw::RPAction::CLEAR; + draw_->BindFramebufferAsRenderTarget(dstBuffer->fbo, { Draw::RPAction::KEEP, Draw::RPAction::KEEP, stencilAction }, "NotifyStencilUpload"); + } else { + // something is wrong... + } + GX2SetViewport(0.0f, 0.0f, (float)w, (float)h, 0.0f, 1.0f); + GX2SetScissor(0, 0, w, h); + gstate_c.Dirty(DIRTY_VIEWPORTSCISSOR_STATE); + + float coord[20] = { + -1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, u1, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, v1, 1.0f, -1.0f, 0.0f, u1, v1, + }; + + memcpy(quadBuffer_, coord, sizeof(float) * 4 * 5); + GX2Invalidate(GX2_INVALIDATE_MODE_ATTRIBUTE_BUFFER, quadBuffer_, sizeof(float) * 4 * 5); + + shaderManagerGX2_->DirtyLastShader(); + textureCacheGX2_->ForgetLastTexture(); + + GX2SetColorControlReg(&StockGX2::blendColorDisabled); + GX2SetTargetChannelMasksReg(&StockGX2::TargetChannelMasks[0x0]); + GX2SetFetchShader(&quadFetchShader_); + GX2SetPixelShader(&stencilUploadPSshaderGX2); + GX2SetVertexShader(&stencilUploadVSshaderGX2); + draw_->BindTextures(0, 1, &tex); + GX2SetCullOnlyControl(GX2_FRONT_FACE_CCW, GX2_DISABLE, GX2_DISABLE); + GX2SetAttribBuffer(0, 4 * quadStride_, quadStride_, fsQuadBuffer_); + GX2SetPixelSampler(&StockGX2::samplerPoint2DClamp, 0); + GX2SetDepthStencilControlReg(&StockGX2::depthDisabledStencilWrite); + gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_RASTER_STATE | DIRTY_DEPTHSTENCIL_STATE); + + for (int i = 1; i < values; i += i) { + if (!(usedBits & i)) { + // It's already zero, let's skip it. 
+ continue; + } + uint8_t mask = 0; + uint8_t value = 0; + if (dstBuffer->format == GE_FORMAT_4444) { + mask = i | (i << 4); + value = i * 16; + } else if (dstBuffer->format == GE_FORMAT_5551) { + mask = 0xFF; + value = i * 128; + } else { + mask = i; + value = i; + } + + GX2SetDepthStencilControlReg(&StockGX2::depthDisabledStencilWrite); + GX2SetStencilMaskReg(&stencilMaskStates_[mask]); + + stencilValueBuffer_->u_stencilValue[0] = value; + GX2Invalidate(GX2_INVALIDATE_MODE_CPU_UNIFORM_BLOCK, stencilValueBuffer_, sizeof(StencilValueUB)); + GX2SetPixelUniformBlock(1, sizeof(StencilValueUB), stencilValueBuffer_); + GX2DrawEx(GX2_PRIMITIVE_MODE_TRIANGLE_STRIP, 4, 0, 1); + } + tex->Release(); + RebindFramebuffer("NotifyStencilUpload"); + return true; +} diff --git a/GPU/GX2/TextureCacheGX2.cpp b/GPU/GX2/TextureCacheGX2.cpp new file mode 100644 index 000000000000..bc41dba87bb8 --- /dev/null +++ b/GPU/GX2/TextureCacheGX2.cpp @@ -0,0 +1,754 @@ +// Copyright (c) 2012- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. 
+ +#include +#include +#include +#include + +#include + +#include "Core/MemMap.h" +#include "Core/Reporting.h" +#include "profiler/profiler.h" +#include "GPU/ge_constants.h" +#include "GPU/GPUState.h" +#include "GPU/GX2/FragmentShaderGeneratorGX2.h" +#include "GPU/GX2/TextureCacheGX2.h" +#include "GPU/GX2/FramebufferManagerGX2.h" +#include "GPU/GX2/ShaderManagerGX2.h" +#include "GPU/GX2/DepalettizeShaderGX2.h" +#include "GPU/GX2/GX2Util.h" +#include "GPU/Common/FramebufferManagerCommon.h" +#include "GPU/Common/TextureDecoder.h" +#include "Core/Config.h" +#include "Core/Host.h" + +#include "ext/xxhash.h" +#include "math/math_util.h" + +#define INVALID_TEX (GX2Texture *)(-1LL) + +SamplerCacheGX2::~SamplerCacheGX2() { + for (auto &iter : cache_) { + delete iter.second; + } +} + +GX2Sampler *SamplerCacheGX2::GetOrCreateSampler(const SamplerCacheKey &key) { + auto iter = cache_.find(key); + if (iter != cache_.end()) { + return iter->second; + } + GX2Sampler *sampler = new GX2Sampler(); + + GX2TexClampMode sClamp = key.sClamp ? GX2_TEX_CLAMP_MODE_CLAMP : GX2_TEX_CLAMP_MODE_WRAP; + GX2TexClampMode tClamp = key.tClamp ? GX2_TEX_CLAMP_MODE_CLAMP : GX2_TEX_CLAMP_MODE_WRAP; + GX2InitSampler(sampler, sClamp, key.magFilt ? GX2_TEX_XY_FILTER_MODE_LINEAR : GX2_TEX_XY_FILTER_MODE_POINT); + GX2InitSamplerClamping(sampler, sClamp, tClamp, sClamp); + // TODO: GX2TexAnisoRatio ? + GX2InitSamplerXYFilter(sampler, key.minFilt ? GX2_TEX_XY_FILTER_MODE_LINEAR : GX2_TEX_XY_FILTER_MODE_POINT, key.magFilt ? GX2_TEX_XY_FILTER_MODE_LINEAR : GX2_TEX_XY_FILTER_MODE_POINT, GX2_TEX_ANISO_RATIO_NONE); + GX2InitSamplerZMFilter(sampler, GX2_TEX_Z_FILTER_MODE_POINT, key.mipFilt ? 
GX2_TEX_MIP_FILTER_MODE_LINEAR : GX2_TEX_MIP_FILTER_MODE_POINT); + GX2InitSamplerBorderType(sampler, GX2_TEX_BORDER_TYPE_WHITE); + + cache_[key] = sampler; + return sampler; +} + +TextureCacheGX2::TextureCacheGX2(Draw::DrawContext *draw) : TextureCacheCommon(draw) { + context_ = (GX2ContextState *)draw->GetNativeObject(Draw::NativeObject::CONTEXT); + + isBgraBackend_ = true; + lastBoundTexture = INVALID_TEX; + + SetupTextureDecoder(); + + nextTexture_ = nullptr; +} + +TextureCacheGX2::~TextureCacheGX2() { + // pFramebufferVertexDecl->Release(); + Clear(true); +} + +void TextureCacheGX2::SetFramebufferManager(FramebufferManagerGX2 *fbManager) { + framebufferManagerGX2_ = fbManager; + framebufferManager_ = fbManager; +} + +void TextureCacheGX2::ReleaseTexture(TexCacheEntry *entry, bool delete_them) { + GX2Texture *texture = (GX2Texture *)entry->texturePtr; + if (texture) { + if (delete_them) { + MEM2_free(texture->surface.image); + delete texture; + } + entry->texturePtr = nullptr; + } +} + +void TextureCacheGX2::ForgetLastTexture() { + InvalidateLastTexture(); + gstate_c.Dirty(DIRTY_TEXTURE_PARAMS); + // GX2SetPixelTexture(nullptr, 0); +} + +void TextureCacheGX2::InvalidateLastTexture(TexCacheEntry *entry) { + if (!entry || entry->texturePtr == lastBoundTexture) { + lastBoundTexture = INVALID_TEX; + } +} + +void TextureCacheGX2::SetFramebufferSamplingParams(u16 bufferWidth, u16 bufferHeight, SamplerCacheKey &key) { + int minFilt; + int magFilt; + bool sClamp; + bool tClamp; + float lodBias; + GETexLevelMode mode; + GetSamplingParams(minFilt, magFilt, sClamp, tClamp, lodBias, 0, 0, mode); + + key.minFilt = minFilt & 1; + key.mipFilt = 0; + key.magFilt = magFilt & 1; + key.sClamp = sClamp; + key.tClamp = tClamp; + + // Often the framebuffer will not match the texture size. We'll wrap/clamp in the shader in that case. + // This happens whether we have OES_texture_npot or not. 
+ int w = gstate.getTextureWidth(0); + int h = gstate.getTextureHeight(0); + if (w != bufferWidth || h != bufferHeight) { + key.sClamp = true; + key.tClamp = true; + } +} + +void TextureCacheGX2::StartFrame() { + InvalidateLastTexture(); + timesInvalidatedAllThisFrame_ = 0; + + if (texelsScaledThisFrame_) { + // INFO_LOG(G3D, "Scaled %i texels", texelsScaledThisFrame_); + } + texelsScaledThisFrame_ = 0; + if (clearCacheNextFrame_) { + Clear(true); + clearCacheNextFrame_ = false; + } else { + Decimate(); + } +} + +void TextureCacheGX2::UpdateCurrentClut(GEPaletteFormat clutFormat, u32 clutBase, bool clutIndexIsSimple) { + const u32 clutBaseBytes = clutBase * (clutFormat == GE_CMODE_32BIT_ABGR8888 ? sizeof(u32) : sizeof(u16)); + // Technically, these extra bytes weren't loaded, but hopefully it was loaded earlier. + // If not, we're going to hash random data, which hopefully doesn't cause a performance issue. + // + // TODO: Actually, this seems like a hack. The game can upload part of a CLUT and reference other data. + // clutTotalBytes_ is the last amount uploaded. We should hash clutMaxBytes_, but this will often hash + // unrelated old entries for small palettes. + // Adding clutBaseBytes may just be mitigating this for some usage patterns. + const u32 clutExtendedBytes = std::min(clutTotalBytes_ + clutBaseBytes, clutMaxBytes_); + + if (replacer_.Enabled()) + clutHash_ = XXH32((const char *)clutBufRaw_, clutExtendedBytes, 0xC0108888); + else + clutHash_ = XXH3_64bits((const char *)clutBufRaw_, clutExtendedBytes) & 0xFFFFFFFF; + clutBuf_ = clutBufRaw_; + + // Special optimization: fonts typically draw clut4 with just alpha values in a single color. 
+ clutAlphaLinear_ = false; + clutAlphaLinearColor_ = 0; + if (clutFormat == GE_CMODE_16BIT_ABGR4444 && clutIndexIsSimple) { + const u16_le *clut = GetCurrentClut(); + clutAlphaLinear_ = true; + clutAlphaLinearColor_ = clut[15] & 0x0FFF; + for (int i = 0; i < 16; ++i) { + u16 step = clutAlphaLinearColor_ | (i << 12); + if (clut[i] != step) { + clutAlphaLinear_ = false; + break; + } + } + } + + clutLastFormat_ = gstate.clutformat; +} + +void TextureCacheGX2::BindTexture(TexCacheEntry *entry) { + GX2Texture *texture = (GX2Texture *)entry->texturePtr; + if (texture != lastBoundTexture) { + GX2SetPixelTexture(texture, 0); + lastBoundTexture = texture; + } + SamplerCacheKey key{}; + UpdateSamplingParams(*entry, key); + GX2Sampler *sampler = samplerCache_.GetOrCreateSampler(key); + GX2SetPixelSampler(sampler, 0); +} + +void TextureCacheGX2::Unbind() { + // GX2SetPixelTexture(nullptr, 0); + InvalidateLastTexture(); +} + +class TextureShaderApplierGX2 { +public: + struct Pos { + Pos(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {} + Pos() {} + + float x; + float y; + float z; + }; + struct UV { + UV(float u_, float v_) : u(u_), v(v_) {} + UV() {} + + float u; + float v; + }; + + struct PosUV { + Pos pos; + UV uv; + }; + + TextureShaderApplierGX2(GX2ContextState *context, GX2PixelShader *pshader, void *dynamicBuffer, float bufferW, float bufferH, int renderW, int renderH, float xoff, float yoff) : context_(context), pshader_(pshader), bufferW_(bufferW), bufferH_(bufferH), renderW_(renderW), renderH_(renderH) { + static const Pos pos[4] = { + { -1, 1, 0 }, + { 1, 1, 0 }, + { -1, -1, 0 }, + { 1, -1, 0 }, + }; + static const UV uv[4] = { + { 0, 0 }, + { 1, 0 }, + { 0, 1 }, + { 1, 1 }, + }; + + for (int i = 0; i < 4; ++i) { + verts_[i].pos = pos[i]; + verts_[i].pos.x += xoff; + verts_[i].pos.y += yoff; + verts_[i].uv = uv[i]; + } + memcpy(dynamicBuffer, &verts_[0], 4 * 5 * sizeof(float)); + GX2Invalidate(GX2_INVALIDATE_MODE_CPU_ATTRIBUTE_BUFFER, dynamicBuffer, 4 * 5 * 
sizeof(float)); + vbuffer_ = dynamicBuffer; + } + + void ApplyBounds(const KnownVertexBounds &bounds, u32 uoff, u32 voff, float xoff, float yoff) { + // If min is not < max, then we don't have values (wasn't set during decode.) + if (bounds.minV < bounds.maxV) { + const float invWidth = 1.0f / bufferW_; + const float invHeight = 1.0f / bufferH_; + // Inverse of half = double. + const float invHalfWidth = invWidth * 2.0f; + const float invHalfHeight = invHeight * 2.0f; + + const int u1 = bounds.minU + uoff; + const int v1 = bounds.minV + voff; + const int u2 = bounds.maxU + uoff; + const int v2 = bounds.maxV + voff; + + const float left = u1 * invHalfWidth - 1.0f + xoff; + const float right = u2 * invHalfWidth - 1.0f + xoff; + const float top = v1 * invHalfHeight - 1.0f + yoff; + const float bottom = v2 * invHalfHeight - 1.0f + yoff; + float z = 0.0f; + // Points are: BL, BR, TL, TR. + verts_[0].pos = Pos(left, bottom, z); + verts_[1].pos = Pos(right, bottom, z); + verts_[2].pos = Pos(left, top, z); + verts_[3].pos = Pos(right, top, z); + + // And also the UVs, same order. + const float uvleft = u1 * invWidth; + const float uvright = u2 * invWidth; + const float uvtop = v1 * invHeight; + const float uvbottom = v2 * invHeight; + verts_[0].uv = UV(uvleft, uvbottom); + verts_[1].uv = UV(uvright, uvbottom); + verts_[2].uv = UV(uvleft, uvtop); + verts_[3].uv = UV(uvright, uvtop); + + // We need to reapply the texture next time since we cropped UV. 
+ gstate_c.Dirty(DIRTY_TEXTURE_PARAMS); + } + } + + void Use(GX2VertexShader *vshader, GX2FetchShader *fshader) { + GX2SetPixelShader(pshader_); + GX2SetVertexShader(vshader); + GX2SetFetchShader(fshader); + } + + void Shade() { + GX2SetViewport(0.0f, 0.0f, (float)renderW_, (float)renderH_, 0.0f, 1.0f); + GX2SetScissor(0, 0, renderW_, renderH_); + GX2SetColorControlReg(&StockGX2::blendDisabledColorWrite); + GX2SetTargetChannelMasksReg(&StockGX2::TargetChannelMasks[0xF]); + GX2SetDepthStencilControlReg(&StockGX2::depthStencilDisabled); + GX2SetCullOnlyControl(GX2_FRONT_FACE_CCW, GX2_DISABLE, GX2_DISABLE); + GX2SetAttribBuffer(0, 4 * stride_, stride_, (u8*)vbuffer_ + offset_); + GX2DrawEx(GX2_PRIMITIVE_MODE_TRIANGLE_STRIP, 4, 0, 1); + gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE); + } + +protected: + GX2ContextState *context_; + GX2PixelShader *pshader_; + void *vbuffer_; + PosUV verts_[4]; + u32 stride_ = sizeof(PosUV); + u32 offset_ = 0; + float bufferW_; + float bufferH_; + int renderW_; + int renderH_; +}; + +void TextureCacheGX2::ApplyTextureFramebuffer(TexCacheEntry *entry, VirtualFramebuffer *framebuffer) { + GX2PixelShader *pshader = nullptr; + u32 clutMode = gstate.clutformat & 0xFFFFFF; + if ((entry->status & TexCacheEntry::STATUS_DEPALETTIZE) && !g_Config.bDisableSlowFramebufEffects) { + pshader = depalShaderCache_->GetDepalettizePixelShader(clutMode, framebuffer->drawnFormat); + } + + if (pshader) { + bool expand32 = !gstate_c.Supports(GPU_SUPPORTS_16BIT_FORMATS); + const GEPaletteFormat clutFormat = gstate.getClutPaletteFormat(); + GX2Texture *clutTexture = depalShaderCache_->GetClutTexture(clutFormat, clutHash_, clutBuf_, expand32); + + Draw::Framebuffer *depalFBO = framebufferManagerGX2_->GetTempFBO(TempFBO::DEPAL, framebuffer->renderWidth, framebuffer->renderHeight, Draw::FBO_8888); + shaderManager_->DirtyLastShader(); + draw_->BindPipeline(nullptr); + + float xoff = -0.5f / 
framebuffer->renderWidth; + float yoff = 0.5f / framebuffer->renderHeight; + + TextureShaderApplierGX2 shaderApply(context_, pshader, framebufferManagerGX2_->GetDynamicQuadBuffer(), framebuffer->bufferWidth, framebuffer->bufferHeight, framebuffer->renderWidth, framebuffer->renderHeight, xoff, yoff); + shaderApply.ApplyBounds(gstate_c.vertBounds, gstate_c.curTextureXOffset, gstate_c.curTextureYOffset, xoff, yoff); + shaderApply.Use(depalShaderCache_->GetDepalettizeVertexShader(), depalShaderCache_->GetFetchShader()); + + GX2SetPixelTexture(clutTexture, 1); + framebufferManagerGX2_->BindFramebufferAsColorTexture(0, framebuffer, BINDFBCOLOR_SKIP_COPY); + GX2SetPixelSampler(&StockGX2::samplerPoint2DWrap, 0); + draw_->BindFramebufferAsRenderTarget(depalFBO, { Draw::RPAction::DONT_CARE, Draw::RPAction::DONT_CARE, Draw::RPAction::DONT_CARE }, "ApplyTextureFramebuffer"); + shaderApply.Shade(); + + framebufferManagerGX2_->RebindFramebuffer("ApplyTextureFramebuffer"); + draw_->BindFramebufferAsTexture(depalFBO, 0, Draw::FB_COLOR_BIT, 0); + + const u32 bytesPerColor = clutFormat == GE_CMODE_32BIT_ABGR8888 ? sizeof(u32) : sizeof(u16); + const u32 clutTotalColors = clutMaxBytes_ / bytesPerColor; + + TexCacheEntry::TexStatus alphaStatus = CheckAlpha(clutBuf_, GetClutDestFormatGX2(clutFormat), clutTotalColors, clutTotalColors, 1); + gstate_c.SetTextureFullAlpha(alphaStatus == TexCacheEntry::STATUS_ALPHA_FULL); + } else { + entry->status &= ~TexCacheEntry::STATUS_DEPALETTIZE; + + framebufferManagerGX2_->BindFramebufferAsColorTexture(0, framebuffer, BINDFBCOLOR_MAY_COPY_WITH_UV | BINDFBCOLOR_APPLY_TEX_OFFSET); + + gstate_c.SetTextureFullAlpha(gstate.getTextureFormat() == GE_TFMT_5650); + framebufferManagerGX2_->RebindFramebuffer("ApplyTextureFramebuffer"); // Probably not necessary. 
+ } + SamplerCacheKey samplerKey{}; + SetFramebufferSamplingParams(framebuffer->bufferWidth, framebuffer->bufferHeight, samplerKey); + GX2Sampler *sampler = samplerCache_.GetOrCreateSampler(samplerKey); + GX2SetPixelSampler(sampler, 0); + InvalidateLastTexture(); + + gstate_c.Dirty(DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_RASTER_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_BLEND_STATE | DIRTY_FRAGMENTSHADER_STATE); +} + +void TextureCacheGX2::BuildTexture(TexCacheEntry *const entry) { + entry->status &= ~TexCacheEntry::STATUS_ALPHA_MASK; + + // For the estimate, we assume cluts always point to 8888 for simplicity. + cacheSizeEstimate_ += EstimateTexMemoryUsage(entry); + + // TODO: If a framebuffer is attached here, might end up with a bad entry.texture. + // Should just always create one here or something (like GLES.) + + if (entry->framebuffer) { + // Nothing else to do here. + return; + } + + if ((entry->bufw == 0 || (gstate.texbufwidth[0] & 0xf800) != 0) && entry->addr >= PSP_GetKernelMemoryEnd()) { + ERROR_LOG_REPORT(G3D, "Texture with unexpected bufw (full=%d)", gstate.texbufwidth[0] & 0xffff); + // Proceeding here can cause a crash. + return; + } + + // Adjust maxLevel to actually present levels.. + bool badMipSizes = false; + int maxLevel = entry->maxLevel; + for (int i = 0; i <= maxLevel; i++) { + // If encountering levels pointing to nothing, adjust max level. + u32 levelTexaddr = gstate.getTextureAddress(i); + if (!Memory::IsValidAddress(levelTexaddr)) { + maxLevel = i - 1; + break; + } + + // If size reaches 1, stop, and override maxlevel. 
+ int tw = gstate.getTextureWidth(i); + int th = gstate.getTextureHeight(i); + if (tw == 1 || th == 1) { + maxLevel = i; + break; + } + + if (i > 0 && gstate_c.Supports(GPU_SUPPORTS_TEXTURE_LOD_CONTROL)) { + if (tw != 1 && tw != (gstate.getTextureWidth(i - 1) >> 1)) + badMipSizes = true; + else if (th != 1 && th != (gstate.getTextureHeight(i - 1) >> 1)) + badMipSizes = true; + } + } + + int scaleFactor = standardScaleFactor_; + + // Rachet down scale factor in low-memory mode. + if (lowMemoryMode_) { + // Keep it even, though, just in case of npot troubles. + scaleFactor = scaleFactor > 4 ? 4 : (scaleFactor > 2 ? 2 : 1); + } + + u64 cachekey = replacer_.Enabled() ? entry->CacheKey() : 0; + int w = gstate.getTextureWidth(0); + int h = gstate.getTextureHeight(0); + ReplacedTexture &replaced = replacer_.FindReplacement(cachekey, entry->fullhash, w, h); + if (replaced.GetSize(0, w, h)) { + // We're replacing, so we won't scale. + scaleFactor = 1; + entry->status |= TexCacheEntry::STATUS_IS_SCALED; + maxLevel = replaced.MaxLevel(); + badMipSizes = false; + } + + // Don't scale the PPGe texture. + if (entry->addr > 0x05000000 && entry->addr < PSP_GetKernelMemoryEnd()) + scaleFactor = 1; + if ((entry->status & TexCacheEntry::STATUS_CHANGE_FREQUENT) != 0 && scaleFactor != 1) { + // Remember for later that we /wanted/ to scale this texture. + entry->status |= TexCacheEntry::STATUS_TO_SCALE; + scaleFactor = 1; + } + + if (scaleFactor != 1) { + if (texelsScaledThisFrame_ >= TEXCACHE_MAX_TEXELS_SCALED) { + entry->status |= TexCacheEntry::STATUS_TO_SCALE; + scaleFactor = 1; + } else { + entry->status &= ~TexCacheEntry::STATUS_TO_SCALE; + entry->status |= TexCacheEntry::STATUS_IS_SCALED; + texelsScaledThisFrame_ += w * h; + } + } + + // Seems to cause problems in Tactics Ogre. 
+ if (badMipSizes) { + maxLevel = 0; + } + + GX2SurfaceFormat dstFmt = GetDestFormat(GETextureFormat(entry->format), gstate.getClutPaletteFormat()); + + if (IsFakeMipmapChange()) { + // NOTE: Since the level is not part of the cache key, we assume it never changes. + u8 level = std::max(0, gstate.getTexLevelOffset16() / 16); + LoadTextureLevel(*entry, replaced, level, maxLevel, scaleFactor, dstFmt); + } else { + LoadTextureLevel(*entry, replaced, 0, maxLevel, scaleFactor, dstFmt); + } + + if (!entry->texturePtr) { + return; + } + +#if 0 // TODO: mipmapping. LoadTextureLevel only allocates the base image and always writes to surface.image, so loading level i > 0 now would decode it on top of level 0. + // Mipmapping is only enabled when texture scaling is disabled. + if (maxLevel > 0 && scaleFactor == 1) { + for (int i = 1; i <= maxLevel; i++) { + LoadTextureLevel(*entry, replaced, i, maxLevel, scaleFactor, dstFmt); + } + } +#endif + + if (maxLevel == 0) { + entry->status |= TexCacheEntry::STATUS_BAD_MIPS; + } else { + entry->status &= ~TexCacheEntry::STATUS_BAD_MIPS; + } + if (replaced.Valid()) { + entry->SetAlphaStatus(TexCacheEntry::TexStatus(replaced.AlphaStatus())); + } +} + +GX2SurfaceFormat GetClutDestFormatGX2(GEPaletteFormat format) { + switch (format) { + case GE_CMODE_16BIT_ABGR4444: return GX2_SURFACE_FORMAT_UNORM_R4_G4_B4_A4; + case GE_CMODE_16BIT_ABGR5551: return GX2_SURFACE_FORMAT_UNORM_R5_G5_B5_A1; + case GE_CMODE_16BIT_BGR5650: return GX2_SURFACE_FORMAT_UNORM_R5_G6_B5; + case GE_CMODE_32BIT_ABGR8888: return GX2_SURFACE_FORMAT_UNORM_R8_G8_B8_A8; + } + // Should never be here !
+ return GX2_SURFACE_FORMAT_UNORM_R8_G8_B8_A8; +} + +GX2SurfaceFormat TextureCacheGX2::GetDestFormat(GETextureFormat format, GEPaletteFormat clutFormat) const { + if (!gstate_c.Supports(GPU_SUPPORTS_16BIT_FORMATS)) { + return GX2_SURFACE_FORMAT_UNORM_R8_G8_B8_A8; + } + + switch (format) { + case GE_TFMT_CLUT4: + case GE_TFMT_CLUT8: + case GE_TFMT_CLUT16: + case GE_TFMT_CLUT32: return GetClutDestFormatGX2(clutFormat); + case GE_TFMT_4444: return GX2_SURFACE_FORMAT_UNORM_R4_G4_B4_A4; + case GE_TFMT_5551: return GX2_SURFACE_FORMAT_UNORM_R5_G5_B5_A1; + case GE_TFMT_5650: return GX2_SURFACE_FORMAT_UNORM_R5_G6_B5; + case GE_TFMT_8888: + case GE_TFMT_DXT1: + case GE_TFMT_DXT3: + case GE_TFMT_DXT5: + default: return GX2_SURFACE_FORMAT_UNORM_R8_G8_B8_A8; + } +} + +TexCacheEntry::TexStatus TextureCacheGX2::CheckAlpha(const u32_le *pixelData, u32 dstFmt, int stride, int w, int h) { + CheckAlphaResult res; + switch (dstFmt) { + case GX2_SURFACE_FORMAT_UNORM_R4_G4_B4_A4: res = CheckAlphaRGBA4444Basic(pixelData, stride, w, h); break; + case GX2_SURFACE_FORMAT_UNORM_R5_G5_B5_A1: res = CheckAlphaRGBA5551Basic(pixelData, stride, w, h); break; + case GX2_SURFACE_FORMAT_UNORM_R5_G6_B5: + // Never has any alpha. 
+ res = CHECKALPHA_FULL; + break; + default: res = CheckAlphaRGBA8888Basic(pixelData, stride, w, h); break; + } + + return (TexCacheEntry::TexStatus)res; +} + +ReplacedTextureFormat FromGX2Format(u32 fmt) { + switch (fmt) { + case GX2_SURFACE_FORMAT_UNORM_R5_G6_B5: return ReplacedTextureFormat::F_5650; + case GX2_SURFACE_FORMAT_UNORM_R5_G5_B5_A1: return ReplacedTextureFormat::F_5551; + case GX2_SURFACE_FORMAT_UNORM_R4_G4_B4_A4: return ReplacedTextureFormat::F_4444; + case GX2_SURFACE_FORMAT_UNORM_R8_G8_B8_A8: + default: return ReplacedTextureFormat::F_8888; + } +} + +GX2SurfaceFormat ToGX2Format(ReplacedTextureFormat fmt) { + switch (fmt) { + case ReplacedTextureFormat::F_5650: return GX2_SURFACE_FORMAT_UNORM_R5_G6_B5; + case ReplacedTextureFormat::F_5551: return GX2_SURFACE_FORMAT_UNORM_R5_G5_B5_A1; + case ReplacedTextureFormat::F_4444: return GX2_SURFACE_FORMAT_UNORM_R4_G4_B4_A4; + case ReplacedTextureFormat::F_8888: + default: return GX2_SURFACE_FORMAT_UNORM_R8_G8_B8_A8; + } +} + +void TextureCacheGX2::LoadTextureLevel(TexCacheEntry &entry, ReplacedTexture &replaced, int level, int maxLevel, int scaleFactor, GX2SurfaceFormat dstFmt) { + PROFILE_THIS_SCOPE("decodetex"); + int w = gstate.getTextureWidth(level); + int h = gstate.getTextureHeight(level); + + GX2Texture *texture = GX2Tex(&entry); + if ((level == 0 || IsFakeMipmapChange()) && texture == nullptr) { + // Create texture + int levels = scaleFactor == 1 ? 
maxLevel + 1 : 1; + int tw = w, th = h; + GX2SurfaceFormat tfmt = dstFmt; + if (replaced.GetSize(level, tw, th)) { + tfmt = ToGX2Format(replaced.Format(level)); + } else { + tw *= scaleFactor; + th *= scaleFactor; + if (scaleFactor > 1) { + tfmt = GX2_SURFACE_FORMAT_UNORM_R8_G8_B8_A8; + } + } + + texture = new GX2Texture(); + texture->surface.width = tw; + texture->surface.height = th; + texture->surface.depth = 1; + texture->surface.dim = GX2_SURFACE_DIM_TEXTURE_2D; + texture->surface.tileMode = GX2_TILE_MODE_LINEAR_ALIGNED; + texture->surface.use = GX2_SURFACE_USE_TEXTURE; + texture->viewNumSlices = 1; + texture->surface.format = tfmt; + switch(tfmt) + { + case GX2_SURFACE_FORMAT_UNORM_R4_G4_B4_A4: + texture->compMap = GX2_COMP_SEL(_r, _g, _b, _a); + break; + case GX2_SURFACE_FORMAT_UNORM_R5_G5_B5_A1: + texture->compMap = GX2_COMP_SEL(_r, _g, _b, _a); + break; + case GX2_SURFACE_FORMAT_UNORM_R5_G6_B5: + texture->compMap = GX2_COMP_SEL(_r, _g, _b, _1); + break; + default: + case GX2_SURFACE_FORMAT_UNORM_R8_G8_B8_A8: + if (dstFmt == GX2_SURFACE_FORMAT_UNORM_R8_G8_B8_A8) { + texture->compMap = GX2_COMP_SEL(_r, _g, _b, _a); + } else { + // scaled 16-bit textures end up native-endian. + texture->compMap = GX2_COMP_SEL(_a, _b, _g, _r); + } + break; + } +#if 0 // TODO: mipmapping + texture->surface.mipLevels = IsFakeMipmapChange() ? 1 : levels; +#endif + + GX2CalcSurfaceSizeAndAlignment(&texture->surface); + GX2InitTextureRegs(texture); + texture->surface.image = MEM2_alloc(texture->surface.imageSize, texture->surface.alignment); + _assert_(texture->surface.image); + + entry.texturePtr = texture; + } + + gpuStats.numTexturesDecoded++; + + u32 *mapData = (u32*)texture->surface.image; + int mapRowPitch = texture->surface.pitch * ((texture->surface.format == GX2_SURFACE_FORMAT_UNORM_R8_G8_B8_A8)? 
4 : 2); + if (replaced.GetSize(level, w, h)) { + replaced.Load(level, mapData, mapRowPitch); + dstFmt = ToGX2Format(replaced.Format(level)); + } else { + GETextureFormat tfmt = (GETextureFormat)entry.format; + GEPaletteFormat clutformat = gstate.getClutPaletteFormat(); + u32 texaddr = gstate.getTextureAddress(level); + int bufw = GetTextureBufw(level, texaddr, tfmt); + int bpp = dstFmt == GX2_SURFACE_FORMAT_UNORM_R8_G8_B8_A8 ? 4 : 2; + u32 *pixelData; + int decPitch; + if (scaleFactor > 1) { + tmpTexBufRearrange_.resize(std::max(bufw, w) * h); + pixelData = tmpTexBufRearrange_.data(); + // We want to end up with a neatly packed texture for scaling. + decPitch = w * bpp; + } else { + pixelData = (u32 *)mapData; + decPitch = mapRowPitch; + } + DecodeTextureLevel((u8 *)pixelData, decPitch, tfmt, clutformat, texaddr, level, bufw, false, false, false); + + // We check before scaling since scaling shouldn't invent alpha from a full alpha texture. + if ((entry.status & TexCacheEntry::STATUS_CHANGE_FREQUENT) == 0) { + TexCacheEntry::TexStatus alphaStatus = CheckAlpha((u32_le *)pixelData, dstFmt, decPitch / bpp, w, h); + entry.SetAlphaStatus(alphaStatus, level); + } else { + entry.SetAlphaStatus(TexCacheEntry::STATUS_ALPHA_UNKNOWN); + } + + if (scaleFactor > 1) { + u32 scaleFmt = (u32)dstFmt; + scaler.ScaleAlways((u32 *)mapData, pixelData, scaleFmt, w, h, scaleFactor); + pixelData = (u32 *)mapData; + + // We always end up at 8888. Other parts assume this. + assert(scaleFmt == GX2_SURFACE_FORMAT_UNORM_R8_G8_B8_A8); + bpp = sizeof(u32); + decPitch = w * bpp; + + if (decPitch != mapRowPitch) { + // Rearrange in place to match the requested pitch. + // (it can only be larger than w * bpp, and a match is likely.) 
+ for (int y = h - 1; y >= 0; --y) { + memcpy((u8 *)mapData + mapRowPitch * y, (u8 *)mapData + decPitch * y, w * bpp); + } + decPitch = mapRowPitch; + } + } + + if (replacer_.Enabled()) { + ReplacedTextureDecodeInfo replacedInfo; + replacedInfo.cachekey = entry.CacheKey(); + replacedInfo.hash = entry.fullhash; + replacedInfo.addr = entry.addr; + replacedInfo.isVideo = videos_.find(entry.addr & 0x3FFFFFFF) != videos_.end(); + replacedInfo.isFinal = (entry.status & TexCacheEntry::STATUS_TO_SCALE) == 0; + replacedInfo.scaleFactor = scaleFactor; + replacedInfo.fmt = FromGX2Format(dstFmt); + + replacer_.NotifyTextureDecoded(replacedInfo, pixelData, decPitch, level, w, h); + } + } +#if 0 // TODO: mipmapping + if (IsFakeMipmapChange()) + context_->UpdateSubresource(texture, 0, nullptr, mapData, mapRowPitch, 0); + else + context_->UpdateSubresource(texture, level, nullptr, mapData, mapRowPitch, 0); +#endif + GX2Invalidate(GX2_INVALIDATE_MODE_CPU_TEXTURE, texture->surface.image, texture->surface.imageSize); +} + +bool TextureCacheGX2::GetCurrentTextureDebug(GPUDebugBuffer &buffer, int level) { + SetTexture(false); + if (!nextTexture_) + return false; + + // Apply texture may need to rebuild the texture if we're about to render, or bind a framebuffer. + TexCacheEntry *entry = nextTexture_; + ApplyTexture(); + + // TODO: Centralize. + if (entry->framebuffer) { + VirtualFramebuffer *vfb = entry->framebuffer; + buffer.Allocate(vfb->bufferWidth, vfb->bufferHeight, GPU_DBG_FORMAT_8888, false); + bool retval = draw_->CopyFramebufferToMemorySync(vfb->fbo, Draw::FB_COLOR_BIT, 0, 0, vfb->bufferWidth, vfb->bufferHeight, Draw::DataFormat::R8G8B8A8_UNORM, buffer.GetData(), vfb->bufferWidth, "GetCurrentTextureDebug"); + gstate_c.Dirty(DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE); + // We may have blitted to a temp FBO. 
+ framebufferManager_->RebindFramebuffer("GetCurrentTextureDebug"); + return retval; + } + + GX2Texture *texture = (GX2Texture *)entry->texturePtr; + if (!texture) + return false; + + if (texture->surface.format != GX2_SURFACE_FORMAT_UNORM_R8_G8_B8_A8) { + // TODO: Support the other formats + return false; + } +#if 0 // TODO: mipmapping + int width = texture->surface.width >> level; + int height = texture->surface.height >> level; +#else + int width = texture->surface.width; + int height = texture->surface.height; +#endif + buffer.Allocate(width, height, GPU_DBG_FORMAT_8888); + + for (int y = 0; y < height; y++) { + // surface.pitch counts texels, not bytes (LoadTextureLevel multiplies it by the texel size), so scale it by the 4 bytes of an 8888 texel. + memcpy(buffer.GetData() + 4 * width * y, (const uint8_t *)texture->surface.image + 4 * texture->surface.pitch * y, 4 * width); + } + + return true; +} diff --git a/GPU/GX2/TextureCacheGX2.h b/GPU/GX2/TextureCacheGX2.h new file mode 100644 index 000000000000..6fd2d6a6189c --- /dev/null +++ b/GPU/GX2/TextureCacheGX2.h @@ -0,0 +1,103 @@ +// Copyright (c) 2017- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+ +#pragma once + +#include +#include +#include + +#include "Common/CommonWindows.h" + +#include "GPU/GPU.h" +#include "GPU/GPUInterface.h" +#include "GPU/GX2/TextureScalerGX2.h" +#include "GPU/Common/TextureCacheCommon.h" + +struct VirtualFramebuffer; + +class FramebufferManagerGX2; +class DepalShaderCacheGX2; +class ShaderManagerGX2; + +class SamplerCacheGX2 { +public: + SamplerCacheGX2() {} + ~SamplerCacheGX2(); + GX2Sampler* GetOrCreateSampler(const SamplerCacheKey &key); + +private: + std::map cache_; +}; + +class TextureCacheGX2 : public TextureCacheCommon { +public: + TextureCacheGX2(Draw::DrawContext *draw); + ~TextureCacheGX2(); + + void StartFrame(); + + void SetFramebufferManager(FramebufferManagerGX2 *fbManager); + void SetDepalShaderCache(DepalShaderCacheGX2 *dpCache) { + depalShaderCache_ = dpCache; + } + void SetShaderManager(ShaderManagerGX2 *sm) { + shaderManager_ = sm; + } + + void ForgetLastTexture() override; + void InvalidateLastTexture(TexCacheEntry *entry = nullptr) override; + + void SetFramebufferSamplingParams(u16 bufferWidth, u16 bufferHeight, SamplerCacheKey &key); + bool GetCurrentTextureDebug(GPUDebugBuffer &buffer, int level) override; + +protected: + void BindTexture(TexCacheEntry *entry) override; + void Unbind() override; + void ReleaseTexture(TexCacheEntry *entry, bool delete_them) override; + +private: + void LoadTextureLevel(TexCacheEntry &entry, ReplacedTexture &replaced, int level, int maxLevel, int scaleFactor, GX2SurfaceFormat dstFmt); + GX2SurfaceFormat GetDestFormat(GETextureFormat format, GEPaletteFormat clutFormat) const; + TexCacheEntry::TexStatus CheckAlpha(const u32_le *pixelData, u32 dstFmt, int stride, int w, int h); + void UpdateCurrentClut(GEPaletteFormat clutFormat, u32 clutBase, bool clutIndexIsSimple) override; + + void ApplyTextureFramebuffer(TexCacheEntry *entry, VirtualFramebuffer *framebuffer) override; + void BuildTexture(TexCacheEntry *const entry) override; + + GX2ContextState *context_; + + GX2Texture 
*&GX2Tex(TexCacheEntry *entry) { + return (GX2Texture *&)entry->texturePtr; + } + + TextureScalerGX2 scaler; + + SamplerCacheGX2 samplerCache_; + + GX2Texture *lastBoundTexture; + + int decimationCounter_; + int texelsScaledThisFrame_; + int timesInvalidatedAllThisFrame_; + + FramebufferManagerGX2 *framebufferManagerGX2_; + DepalShaderCacheGX2 *depalShaderCache_; + ShaderManagerGX2 *shaderManager_; +}; + +GX2SurfaceFormat GetClutDestFormatGX2(GEPaletteFormat format); diff --git a/GPU/GX2/TextureScalerGX2.cpp b/GPU/GX2/TextureScalerGX2.cpp new file mode 100644 index 000000000000..6d69cf9092c5 --- /dev/null +++ b/GPU/GX2/TextureScalerGX2.cpp @@ -0,0 +1,59 @@ +// Copyright (c) 2012- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. + +#include + +#include +#include "Common/ColorConv.h" +#include "Common/ThreadPools.h" +#include "GPU/Common/TextureScalerCommon.h" +#include "GPU/GX2/TextureScalerGX2.h" +#include "GPU/GX2/GPU_GX2.h" + +#undef _1 + +int TextureScalerGX2::BytesPerPixel(u32 format) { + return format == GX2_SURFACE_FORMAT_UNORM_R8_G8_B8_A8 ? 
4 : 2; +} + +u32 TextureScalerGX2::Get8888Format() { + return GX2_SURFACE_FORMAT_UNORM_R8_G8_B8_A8; +} + +void TextureScalerGX2::ConvertTo8888(u32 format, u32* source, u32* &dest, int width, int height) { + switch (format) { + case GX2_SURFACE_FORMAT_UNORM_R8_G8_B8_A8: + dest = source; // already fine + break; + + case GX2_SURFACE_FORMAT_UNORM_R4_G4_B4_A4: + GlobalThreadPool::Loop(std::bind(&convert4444_dx9, (u16_le*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height); + break; + + case GX2_SURFACE_FORMAT_UNORM_R5_G6_B5: + GlobalThreadPool::Loop(std::bind(&convert565_dx9, (u16_le*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height); + break; + + case GX2_SURFACE_FORMAT_UNORM_R5_G5_B5_A1: + GlobalThreadPool::Loop(std::bind(&convert5551_dx9, (u16_le*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height); + break; + + default: + dest = source; + ERROR_LOG(G3D, "iXBRZTexScaling: unsupported texture format"); + } +} diff --git a/GPU/GX2/TextureScalerGX2.h b/GPU/GX2/TextureScalerGX2.h new file mode 100644 index 000000000000..753b39827376 --- /dev/null +++ b/GPU/GX2/TextureScalerGX2.h @@ -0,0 +1,29 @@ +// Copyright (c) 2012- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. 
+ +#pragma once + +#include "Common/CommonTypes.h" +#include "GPU/Common/TextureScalerCommon.h" + +class TextureScalerGX2 : public TextureScalerCommon { +private: + // NOTE: `format` values are GX2_SURFACE_FORMAT_* constants; ConvertTo8888 handles the 8888, 4444, 565 and 5551 UNORM formats. + void ConvertTo8888(u32 format, u32* source, u32* &dest, int width, int height) override; + int BytesPerPixel(u32 format) override; + u32 Get8888Format() override; +}; diff --git a/GPU/GX2/VertexShaderGeneratorGX2.cpp b/GPU/GX2/VertexShaderGeneratorGX2.cpp new file mode 100644 index 000000000000..1aa03ee2f36f --- /dev/null +++ b/GPU/GX2/VertexShaderGeneratorGX2.cpp @@ -0,0 +1,143 @@ +// Copyright (c) 2017- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+ +#include "GPU/Common/ShaderCommon.h" +#include "GPU/Common/ShaderUniforms.h" +#include "GPU/ge_constants.h" + +#include "GPU/GX2/VertexShaderGeneratorGX2.h" +#include "GPU/GX2/ShaderManagerGX2.h" + +#include +#include +#include + +#include "GPU/Vulkan/VertexShaderGeneratorVulkan.h" +#include + +using namespace GX2Gen; + +class VertexShaderGeneratorGX2 : private GX2VertexShaderEmitter { +public: + VertexShaderGeneratorGX2() {} + bool Supported(const VShaderID &id); + void Emit(const VShaderID &id, GX2VertexShader *vs); +}; + +bool VertexShaderGeneratorGX2::Supported(const VShaderID &id) { + VShaderID unsupported; + unsupported.SetBit(VS_BIT_LMODE); + unsupported.SetBit(VS_BIT_ENABLE_FOG); + unsupported.SetBit(VS_BIT_DO_TEXTURE_TRANSFORM); + unsupported.SetBit(VS_BIT_USE_HW_TRANSFORM); + unsupported.SetBit(VS_BIT_HAS_NORMAL); + unsupported.SetBit(VS_BIT_NORM_REVERSE); + unsupported.SetBit(VS_BIT_HAS_TEXCOORD); + unsupported.SetBit(VS_BIT_HAS_COLOR_TESS); + unsupported.SetBit(VS_BIT_HAS_TEXCOORD_TESS); + unsupported.SetBit(VS_BIT_NORM_REVERSE_TESS); + unsupported.SetBit(VS_BIT_HAS_NORMAL_TESS); + unsupported.SetBits(VS_BIT_UVGEN_MODE, 2, -1); + unsupported.SetBits(VS_BIT_UVPROJ_MODE, 2, -1); + unsupported.SetBits(VS_BIT_LS0, 2, -1); + unsupported.SetBits(VS_BIT_LS1, 2, -1); + unsupported.SetBits(VS_BIT_BONES, 3, -1); + unsupported.SetBit(VS_BIT_ENABLE_BONES); + unsupported.SetBits(VS_BIT_LIGHT0_COMP, 2, -1); + unsupported.SetBits(VS_BIT_LIGHT0_TYPE, 2, -1); + unsupported.SetBits(VS_BIT_LIGHT1_COMP, 2, -1); + unsupported.SetBits(VS_BIT_LIGHT1_TYPE, 2, -1); + unsupported.SetBits(VS_BIT_LIGHT2_COMP, 2, -1); + unsupported.SetBits(VS_BIT_LIGHT2_TYPE, 2, -1); + unsupported.SetBits(VS_BIT_LIGHT3_COMP, 2, -1); + unsupported.SetBits(VS_BIT_LIGHT3_TYPE, 2, -1); + unsupported.SetBits(VS_BIT_MATERIAL_UPDATE, 3, -1); + unsupported.SetBit(VS_BIT_SPLINE); + unsupported.SetBit(VS_BIT_LIGHT0_ENABLE); + unsupported.SetBit(VS_BIT_LIGHT1_ENABLE); + 
unsupported.SetBit(VS_BIT_LIGHT2_ENABLE); + unsupported.SetBit(VS_BIT_LIGHT3_ENABLE); + unsupported.SetBit(VS_BIT_LIGHTING_ENABLE); + unsupported.SetBits(VS_BIT_WEIGHT_FMTSCALE, 2, -1); + unsupported.SetBit(VS_BIT_FLATSHADE); + unsupported.SetBit(VS_BIT_BEZIER); + + return !(unsupported.d[0] & id.d[0]) && !(unsupported.d[1] & id.d[1]); +} + +void VertexShaderGeneratorGX2::Emit(const VShaderID &id, GX2VertexShader *vs) { + Reg pos = allocImportReg(VSInput::POSITION); + + GX2Emitter::KCacheRegs proj = KCacheRegs(UB_Bindings::Base, offsetof(UB_VS_FS_Base, proj), this); + if (id.Bit(VS_BIT_IS_THROUGH)) + proj = KCacheRegs(UB_Bindings::Base, offsetof(UB_VS_FS_Base, proj_through), this); + + MUL(___(x), pos(x), proj[0](x)); + MUL(___(y), pos(x), proj[0](y)); + MUL(___(z), pos(x), proj[0](z)); + MUL(___(w), pos(x), proj[0](w)); + ALU_LAST(); + MULADD(___(x), pos(y), proj[1](x), PV(x)); + MULADD(___(y), pos(y), proj[1](y), PV(y)); + MULADD(___(z), pos(y), proj[1](z), PV(z)); + MULADD(___(w), pos(y), proj[1](w), PV(w)); + ALU_LAST(); + MULADD(___(x), pos(z), proj[2](x), PV(x)); + MULADD(___(y), pos(z), proj[2](y), PV(y)); + MULADD(___(z), pos(z), proj[2](z), PV(z)); + MULADD(___(w), pos(z), proj[2](w), PV(w)); + ALU_LAST(); + ADD(pos(x), proj[3](x), PV(x)); + ADD(pos(y), proj[3](y), PV(y)); + ADD(pos(z), proj[3](z), PV(z)); + ADD(pos(w), proj[3](w), PV(w)); + ALU_LAST(); + + EXP_POS(pos); + + if (id.Bit(VS_BIT_DO_TEXTURE)) + EXP_PARAM(PSInput::COORDS, allocImportReg(VSInput::COORDS)(x, y, _1_, __), NO_BARRIER); + + if (id.Bit(VS_BIT_HAS_COLOR)) + EXP_PARAM(PSInput::COLOR0, allocImportReg(VSInput::COLOR0), NO_BARRIER); + + END_OF_PROGRAM(vs); +} +void GenerateVertexShaderGX2(const VShaderID &id, GX2VertexShader *vs) { + VertexShaderGeneratorGX2 vsGen; + if (vsGen.Supported(id)) { + vsGen.Emit(id, vs); +#if 0 + char buffer[0x20000]; + printf("\n### GPU Regs ###\n"); + GX2VertexShaderInfo(vs, buffer); + puts(buffer); + + printf("\n### ASM ###\n%s\n", 
VertexShaderDesc(id).c_str()); + DisassembleGX2Shader(vs->program, vs->size, buffer); + puts(buffer); + + printf("\n### glsl ###\n"); + GenerateVulkanGLSLVertexShader(id, buffer); + puts(buffer); +#endif + } else { + WARN_LOG(G3D, "unsupported VShaderID: \"%s\"", VertexShaderDesc(id).c_str()); + *vs = id.Bit(VS_BIT_USE_HW_TRANSFORM) ? id.Bit(VS_BIT_ENABLE_BONES) ? VShaderHWSkinGX2 : VShaderHWNoSkinGX2 + : VShaderSWGX2; + } +} diff --git a/GPU/GX2/VertexShaderGeneratorGX2.h b/GPU/GX2/VertexShaderGeneratorGX2.h new file mode 100644 index 000000000000..05acfff97539 --- /dev/null +++ b/GPU/GX2/VertexShaderGeneratorGX2.h @@ -0,0 +1,23 @@ +// Copyright (c) 2017- PPSSPP Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0 or later versions. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official git repository and contact information can be found at +// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. 
+ +#pragma once + +#include +#include "GPU/Common/ShaderId.h" + +void GenerateVertexShaderGX2(const VShaderID &id, GX2VertexShader *vs); diff --git a/GPU/GX2/default.frag b/GPU/GX2/default.frag new file mode 100644 index 000000000000..fcf4219c5bf4 --- /dev/null +++ b/GPU/GX2/default.frag @@ -0,0 +1,699 @@ +#version 400 +#extension GL_ARB_separate_shader_objects : enable +#extension GL_ARB_shading_language_420pack : enable + +//#define FLAT_SHADING +//#define BUG_PVR_SHADER_PRECISION_BAD +//#define BUG_PVR_SHADER_PRECISION_TERRIBLE + +layout(std140, binding = 1) uniform baseVars +{ + mat4 proj_mtx; + mat4 proj_through_mtx; + mat3x4 view_mtx; + mat3x4 world_mtx; + mat3x4 tex_mtx; + vec4 uvscaleoffset; + vec4 depthRange; + vec2 fogcoef; + float stencilReplace; + vec4 matambientalpha; + uint spline_counts; + uint depal_mask_shift_off_fmt; + int pad2; + int pad3; + vec3 fogcolor; + vec3 texenv; + ivec4 alphacolorref; + ivec4 alphacolormask; + vec3 blendFixA; + vec3 blendFixB; + vec4 texclamp; + vec2 texclampoff; +} base; + +layout(std140, binding = 5) uniform UB_FSID +{ + bool FS_BIT_CLEARMODE; + bool FS_BIT_DO_TEXTURE; + int FS_BIT_TEXFUNC; + bool FS_BIT_TEXALPHA; + bool FS_BIT_SHADER_DEPAL; + bool FS_BIT_SHADER_TEX_CLAMP; + bool FS_BIT_CLAMP_S; + bool FS_BIT_CLAMP_T; + bool FS_BIT_TEXTURE_AT_OFFSET; + bool FS_BIT_LMODE; + bool FS_BIT_ALPHA_TEST; + int FS_BIT_ALPHA_TEST_FUNC; + bool FS_BIT_ALPHA_AGAINST_ZERO; + bool FS_BIT_COLOR_TEST; + int FS_BIT_COLOR_TEST_FUNC; + bool FS_BIT_COLOR_AGAINST_ZERO; + bool FS_BIT_ENABLE_FOG; + bool FS_BIT_DO_TEXTURE_PROJ; + bool FS_BIT_COLOR_DOUBLE; + int FS_BIT_STENCIL_TO_ALPHA; + int FS_BIT_REPLACE_ALPHA_WITH_STENCIL_TYPE; + int FS_BIT_REPLACE_LOGIC_OP_TYPE; + int FS_BIT_REPLACE_BLEND; + int FS_BIT_BLENDEQ; + int FS_BIT_BLENDFUNC_A; + int FS_BIT_BLENDFUNC_B; + bool FS_BIT_FLATSHADE; + bool FS_BIT_BGRA_TEXTURE; + bool GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT; + bool GPU_SUPPORTS_DEPTH_CLAMP; + bool GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT; + bool 
GPU_SUPPORTS_ACCURATE_DEPTH; +}; + +#define GE_BLENDMODE_MUL_AND_ADD 0 +#define GE_BLENDMODE_MUL_AND_SUBTRACT 1 +#define GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE 2 +#define GE_BLENDMODE_MIN 3 +#define GE_BLENDMODE_MAX 4 +#define GE_BLENDMODE_ABSDIFF 5 + +#define GE_DSTBLEND_SRCCOLOR 0 +#define GE_DSTBLEND_INVSRCCOLOR 1 +#define GE_DSTBLEND_SRCALPHA 2 +#define GE_DSTBLEND_INVSRCALPHA 3 +#define GE_DSTBLEND_DSTALPHA 4 +#define GE_DSTBLEND_INVDSTALPHA 5 +#define GE_DSTBLEND_DOUBLESRCALPHA 6 +#define GE_DSTBLEND_DOUBLEINVSRCALPHA 7 +#define GE_DSTBLEND_DOUBLEDSTALPHA 8 +#define GE_DSTBLEND_DOUBLEINVDSTALPHA 9 +#define GE_DSTBLEND_FIXB 10 + +#define GE_SRCBLEND_DSTCOLOR 0 +#define GE_SRCBLEND_INVDSTCOLOR 1 +#define GE_SRCBLEND_SRCALPHA 2 +#define GE_SRCBLEND_INVSRCALPHA 3 +#define GE_SRCBLEND_DSTALPHA 4 +#define GE_SRCBLEND_INVDSTALPHA 5 +#define GE_SRCBLEND_DOUBLESRCALPHA 6 +#define GE_SRCBLEND_DOUBLEINVSRCALPHA 7 +#define GE_SRCBLEND_DOUBLEDSTALPHA 8 +#define GE_SRCBLEND_DOUBLEINVDSTALPHA 9 +#define GE_SRCBLEND_FIXA 10 + +#define GE_COMP_NEVER 0 +#define GE_COMP_ALWAYS 1 +#define GE_COMP_EQUAL 2 +#define GE_COMP_NOTEQUAL 3 +#define GE_COMP_LESS 4 +#define GE_COMP_LEQUAL 5 +#define GE_COMP_GREATER 6 +#define GE_COMP_GEQUAL 7 + +#define GE_TEXFUNC_MODULATE 0 +#define GE_TEXFUNC_DECAL 1 +#define GE_TEXFUNC_BLEND 2 +#define GE_TEXFUNC_REPLACE 3 +#define GE_TEXFUNC_ADD 4 +#define GE_TEXFUNC_UNKNOWN1 5 +#define GE_TEXFUNC_UNKNOWN2 6 +#define GE_TEXFUNC_UNKNOWN3 7 + +#define REPLACE_BLEND_NO 0 +#define REPLACE_BLEND_STANDARD 1 +#define REPLACE_BLEND_PRE_SRC 2 +#define REPLACE_BLEND_PRE_SRC_2X_ALPHA 3 +#define REPLACE_BLEND_2X_ALPHA 4 +#define REPLACE_BLEND_2X_SRC 5 +#define REPLACE_BLEND_COPY_FBO 6 + +#define REPLACE_ALPHA_NO 0 +#define REPLACE_ALPHA_YES 1 +#define REPLACE_ALPHA_DUALSOURCE 2 + +#define STENCIL_VALUE_UNIFORM 0 +#define STENCIL_VALUE_ZERO 1 +#define STENCIL_VALUE_ONE 2 +#define STENCIL_VALUE_KEEP 3 +#define STENCIL_VALUE_INVERT 4 +#define STENCIL_VALUE_INCR_4 5
+#define STENCIL_VALUE_INCR_8 6 +#define STENCIL_VALUE_DECR_4 7 +#define STENCIL_VALUE_DECR_8 8 + +#define LOGICOPTYPE_NORMAL 0 +#define LOGICOPTYPE_ONE 1 +#define LOGICOPTYPE_INVERT 2 + +#ifdef FLAT_SHADING +#define shading flat +#else +#define shading +#endif + +layout(binding = 0) uniform sampler2D tex; +layout(binding = 1) uniform sampler2D fbotex; +layout(binding = 2) uniform sampler2D pal; + +layout(location = 0) in vec3 v_texcoord; +layout(location = 1) shading in vec4 v_color0; +layout(location = 2) shading in vec3 v_color1; +layout(location = 3) in float v_fogdepth; + +layout(location = 0) out vec4 fragColor0; +layout(location = 1) out vec4 fragColor1; +out float gl_FragDepth; + +int roundAndScaleTo255i(in float x) +{ + return int(floor(x * 255.0 + 0.5)); +} + +ivec3 roundAndScaleTo255i(in vec3 x) +{ + return ivec3(floor(x * 255.0 + 0.5)); +} + +// PowerVR needs a custom modulo function. For some reason, this has far higher precision than the builtin one. +#ifdef BUG_PVR_SHADER_PRECISION_BAD +float mymod(float a, float b) +{ + return a - b * floor(a / b); +} +#define mod mymod +#endif + +void main() +{ + vec4 v = v_color0; + if (!FS_BIT_CLEARMODE) // Clear mode does not allow any fancy shading. + { + if (FS_BIT_DO_TEXTURE) + { + vec3 texcoord = v_texcoord; + vec4 t, t1, t2, t3; +#ifndef BUG_PVR_SHADER_PRECISION_TERRIBLE + // TODO: Not sure the right way to do this for projection. + // This path destroys resolution on older PowerVR no matter what I do, + // so we disable it on SGX 540 and lesser, and live with the consequences. + if (FS_BIT_SHADER_TEX_CLAMP) + { + // We may be clamping inside a larger surface (tex = 64x64, buffer=480x272). + // We may also be wrapping in such a surface, or either one in a too-small surface. + // Obviously, clamping to a smaller surface won't work. But better to clamp to something. 
+ if (FS_BIT_DO_TEXTURE_PROJ) + texcoord.xy /= v_texcoord.z; + + if (FS_BIT_CLAMP_S) + texcoord.x = clamp(texcoord.x, base.texclamp.z, base.texclamp.x - base.texclamp.z); + else + texcoord.x = mod(texcoord.x, base.texclamp.x); + + if (FS_BIT_CLAMP_T) + texcoord.y = clamp(texcoord.y, base.texclamp.w, base.texclamp.y - base.texclamp.w); + else + texcoord.y = mod(texcoord.y, base.texclamp.y); + + if (FS_BIT_TEXTURE_AT_OFFSET) + texcoord.xy += base.texclampoff.xy; + } + + if (FS_BIT_DO_TEXTURE_PROJ && !FS_BIT_SHADER_TEX_CLAMP) +#else + if (FS_BIT_DO_TEXTURE_PROJ) +#endif + { + t = textureProj(tex, texcoord); + + if (FS_BIT_SHADER_DEPAL) + { + t1 = textureProjOffset(tex, texcoord, ivec2(1, 0)); + t2 = textureProjOffset(tex, texcoord, ivec2(0, 1)); + t3 = textureProjOffset(tex, texcoord, ivec2(1, 1)); + } + } + else + { + t = texture(tex, texcoord.xy); + if (FS_BIT_SHADER_DEPAL) + { + t1 = textureOffset(tex, texcoord.xy, ivec2(1, 0)); + t2 = textureOffset(tex, texcoord.xy, ivec2(0, 1)); + t3 = textureOffset(tex, texcoord.xy, ivec2(1, 1)); + } + } + + if (FS_BIT_SHADER_DEPAL) + { + uint depalMask = (base.depal_mask_shift_off_fmt & 0xFF); + uint depalShift = (base.depal_mask_shift_off_fmt >> 8) & 0xFF; + uint depalOffset = ((base.depal_mask_shift_off_fmt >> 16) & 0xFF) << 4; + uint depalFmt = (base.depal_mask_shift_off_fmt >> 24) & 0x3; + bool bilinear = (base.depal_mask_shift_off_fmt >> 31) != 0; + vec2 fraction = fract(texcoord.xy * vec2(textureSize(tex, 0).xy) /* -0.5 ? */); + uvec4 col; + uint index0; + uint index1; + uint index2; + uint index3; + switch (depalFmt) // We might want to include fmt in the shader ID if this is a performance issue. 
+ { + case 0: // 565 + col = uvec4(t.rgb * vec3(31.99, 63.99, 31.99), 0); + index0 = (col.b << 11) | (col.g << 5) | (col.r); + if (bilinear) + { + col = uvec4(t1.rgb * vec3(31.99, 63.99, 31.99), 0); + index1 = (col.b << 11) | (col.g << 5) | (col.r); + col = uvec4(t2.rgb * vec3(31.99, 63.99, 31.99), 0); + index2 = (col.b << 11) | (col.g << 5) | (col.r); + col = uvec4(t3.rgb * vec3(31.99, 63.99, 31.99), 0); + index3 = (col.b << 11) | (col.g << 5) | (col.r); + } + break; + case 1: // 5551 + col = uvec4(t.rgba * vec4(31.99, 31.99, 31.99, 1.0)); + index0 = (col.a << 15) | (col.b << 10) | (col.g << 5) | (col.r); + if (bilinear) + { + col = uvec4(t1.rgba * vec4(31.99, 31.99, 31.99, 1.0)); + index1 = (col.a << 15) | (col.b << 10) | (col.g << 5) | (col.r); + col = uvec4(t2.rgba * vec4(31.99, 31.99, 31.99, 1.0)); + index2 = (col.a << 15) | (col.b << 10) | (col.g << 5) | (col.r); + col = uvec4(t3.rgba * vec4(31.99, 31.99, 31.99, 1.0)); + index3 = (col.a << 15) | (col.b << 10) | (col.g << 5) | (col.r); + } + break; + case 2: // 4444 + col = uvec4(t.rgba * vec4(15.99, 15.99, 15.99, 15.99)); + index0 = (col.a << 12) | (col.b << 8) | (col.g << 4) | (col.r); + if (bilinear) + { + col = uvec4(t1.rgba * vec4(15.99, 15.99, 15.99, 15.99)); + index1 = (col.a << 12) | (col.b << 8) | (col.g << 4) | (col.r); + col = uvec4(t2.rgba * vec4(15.99, 15.99, 15.99, 15.99)); + index2 = (col.a << 12) | (col.b << 8) | (col.g << 4) | (col.r); + col = uvec4(t3.rgba * vec4(15.99, 15.99, 15.99, 15.99)); + index3 = (col.a << 12) | (col.b << 8) | (col.g << 4) | (col.r); + } + break; + case 3: // 8888 + col = uvec4(t.rgba * vec4(255.99, 255.99, 255.99, 255.99)); + index0 = (col.a << 24) | (col.b << 16) | (col.g << 8) | (col.r); + if (bilinear) + { + col = uvec4(t1.rgba * vec4(255.99, 255.99, 255.99, 255.99)); + index1 = (col.a << 24) | (col.b << 16) | (col.g << 8) | (col.r); + col = uvec4(t2.rgba * vec4(255.99, 255.99, 255.99, 255.99)); + index2 = (col.a << 24) | (col.b << 16) | (col.g << 8) | (col.r); + 
col = uvec4(t3.rgba * vec4(255.99, 255.99, 255.99, 255.99)); + index3 = (col.a << 24) | (col.b << 16) | (col.g << 8) | (col.r); + } + break; + }; + index0 = ((index0 >> depalShift) & depalMask) | depalOffset; + t = texelFetch(pal, ivec2(index0, 0), 0); + if (bilinear) + { + index1 = ((index1 >> depalShift) & depalMask) | depalOffset; + index2 = ((index2 >> depalShift) & depalMask) | depalOffset; + index3 = ((index3 >> depalShift) & depalMask) | depalOffset; + t1 = texelFetch(pal, ivec2(index1, 0), 0); + t2 = texelFetch(pal, ivec2(index2, 0), 0); + t3 = texelFetch(pal, ivec2(index3, 0), 0); + t = mix(t, t1, fraction.x); + t2 = mix(t2, t3, fraction.x); + t = mix(t, t2, fraction.y); + } + } + + if (FS_BIT_TEXALPHA) // texfmt == RGBA + { + switch (FS_BIT_TEXFUNC) + { + case GE_TEXFUNC_MODULATE: + v *= t; + break; + case GE_TEXFUNC_DECAL: + v.rgb = mix(v.rgb, t.rgb, t.a); + break; + case GE_TEXFUNC_BLEND: + v = vec4(mix(v.rgb, base.texenv.rgb, t.rgb), v.a * t.a); + break; + case GE_TEXFUNC_REPLACE: + v = t; + break; + case GE_TEXFUNC_ADD: + case GE_TEXFUNC_UNKNOWN1: + case GE_TEXFUNC_UNKNOWN2: + case GE_TEXFUNC_UNKNOWN3: + v = vec4(v.rgb + t.rgb, v.a * t.a); + break; + } + } + else // texfmt == RGB + { + switch (FS_BIT_TEXFUNC) + { + case GE_TEXFUNC_MODULATE: + v.rgb *= t.rgb; + break; + case GE_TEXFUNC_DECAL: + v.rgb = t.rgb; + break; + case GE_TEXFUNC_BLEND: + v = vec4(mix(v.rgb, base.texenv.rgb, t.rgb), v.a); + break; + case GE_TEXFUNC_REPLACE: + v = vec4(t.rgb, v.a); + break; + case GE_TEXFUNC_ADD: + case GE_TEXFUNC_UNKNOWN1: + case GE_TEXFUNC_UNKNOWN2: + case GE_TEXFUNC_UNKNOWN3: + v = vec4(v.rgb + t.rgb, v.a); + break; + } + } + } + // Secondary color for specular on top of texture + v += vec4(v_color1, 0.0); + + if (FS_BIT_ALPHA_TEST) + { + if (FS_BIT_ALPHA_AGAINST_ZERO) + { + // When testing against 0 (extremely common), we can avoid some math. + // 0.002 is approximately half of 1.0 / 255.0. 
+ if (FS_BIT_ALPHA_TEST_FUNC == GE_COMP_NOTEQUAL || FS_BIT_ALPHA_TEST_FUNC == GE_COMP_GREATER) + { + if (v.a < 0.002) + discard; + } + else if (FS_BIT_ALPHA_TEST_FUNC == GE_COMP_NEVER) + // NEVER has been logged as used by games, although it makes little sense - statically failing. + // Maybe we could discard the drawcall, but it's pretty rare. Let's just statically discard here. + discard; + else if (v.a > 0.002) + // Anything else is a test for == 0. Happens sometimes, actually... + discard; + } + else + { + switch (FS_BIT_ALPHA_TEST_FUNC) + { + case GE_COMP_EQUAL: + if (!((roundAndScaleTo255i(v.a) & base.alphacolormask.a) == base.alphacolorref.a)) discard; + break; + case GE_COMP_NOTEQUAL: + if (!((roundAndScaleTo255i(v.a) & base.alphacolormask.a) != base.alphacolorref.a)) discard; + break; + case GE_COMP_LESS: + if (!((roundAndScaleTo255i(v.a) & base.alphacolormask.a) < base.alphacolorref.a)) discard; + break; + case GE_COMP_LEQUAL: + if (!((roundAndScaleTo255i(v.a) & base.alphacolormask.a) <= base.alphacolorref.a)) discard; + break; + case GE_COMP_GREATER: + if (!((roundAndScaleTo255i(v.a) & base.alphacolormask.a) > base.alphacolorref.a)) discard; + break; + case GE_COMP_GEQUAL: + if (!((roundAndScaleTo255i(v.a) & base.alphacolormask.a) >= base.alphacolorref.a)) discard; + break; + // This means NEVER. See above. + case GE_COMP_NEVER: + case GE_COMP_ALWAYS: + default: + discard; + break; + } + } + } + + if (FS_BIT_COLOR_TEST) + { + if (FS_BIT_COLOR_AGAINST_ZERO) + { + // When testing against 0 (common), we can avoid some math. + // Have my doubts that this special case is actually worth it, but whatever. + // 0.002 is approximately half of 1.0 / 255.0. + if (FS_BIT_COLOR_TEST_FUNC == GE_COMP_NOTEQUAL) + { + if (v.r + v.g + v.b < 0.002) discard; + } + else if (FS_BIT_COLOR_TEST_FUNC != GE_COMP_NEVER) + { + // Anything else is a test for == 0. 
+ if (v.r + v.g + v.b > 0.002) discard; + } + else + { + // NEVER has been logged as used by games, although it makes little sense - statically failing. + // Maybe we could discard the drawcall, but it's pretty rare. Let's just statically discard here. + discard; + } + } + else + { + if (FS_BIT_COLOR_TEST_FUNC == GE_COMP_EQUAL) + { + if (!((roundAndScaleTo255i(v.rgb) & base.alphacolormask.rgb) == (base.alphacolorref.rgb & base.alphacolormask.rgb))) + discard; + } + else if (FS_BIT_COLOR_TEST_FUNC == GE_COMP_NOTEQUAL) + { + if (!((roundAndScaleTo255i(v.rgb) & base.alphacolormask.rgb) != (base.alphacolorref.rgb & base.alphacolormask.rgb))) + discard; + } + else + discard; + } + } + + // Color doubling happens after the color test. + if (FS_BIT_COLOR_DOUBLE && FS_BIT_REPLACE_BLEND == REPLACE_BLEND_2X_SRC) + v.rgb = v.rgb * 4.0; + else if (FS_BIT_COLOR_DOUBLE || FS_BIT_REPLACE_BLEND == REPLACE_BLEND_2X_SRC) + v.rgb = v.rgb * 2.0; + + if (FS_BIT_ENABLE_FOG) + { + float fogCoef = clamp(v_fogdepth, 0.0, 1.0); + v.rgb = mix(base.fogcolor.rgb, v.rgb, fogCoef); // Fog blends color only; both mix() operands must be vec3, alpha is untouched. + // v.x = v_depth; + } + + if (FS_BIT_REPLACE_BLEND == REPLACE_BLEND_PRE_SRC || FS_BIT_REPLACE_BLEND == REPLACE_BLEND_PRE_SRC_2X_ALPHA) + { + vec3 srcFactor; + switch (FS_BIT_BLENDFUNC_A) + { + case GE_SRCBLEND_SRCALPHA: + srcFactor = vec3(v.a); + break; + case GE_SRCBLEND_INVSRCALPHA: + srcFactor = vec3(1.0 - v.a); + break; + case GE_SRCBLEND_DOUBLESRCALPHA: + srcFactor = vec3(v.a * 2.0); + break; + case GE_SRCBLEND_DOUBLEINVSRCALPHA: + srcFactor = vec3(1.0 - v.a * 2.0); + break; + case GE_SRCBLEND_FIXA: + srcFactor = vec3(base.blendFixA); + break; + default: + srcFactor = vec3(1.0); + } + v.rgb = v.rgb * srcFactor; + } + + if (FS_BIT_REPLACE_BLEND == REPLACE_BLEND_COPY_FBO) + { + // lowp vec4 destColor; + vec4 destColor = texelFetch(fbotex, ivec2(gl_FragCoord.x, gl_FragCoord.y), 0); + vec3 srcFactor; + vec3 dstFactor; + switch (FS_BIT_BLENDFUNC_A) + { + case GE_SRCBLEND_DSTCOLOR: + srcFactor = destColor.rgb; + break; + 
case GE_SRCBLEND_INVDSTCOLOR: + srcFactor = vec3(1.0) - destColor.rgb; + break; + case GE_SRCBLEND_SRCALPHA: + srcFactor = vec3(v.a); + break; + case GE_SRCBLEND_INVSRCALPHA: + srcFactor = vec3(1.0 - v.a); + break; + case GE_SRCBLEND_DSTALPHA: + srcFactor = vec3(destColor.a); + break; + case GE_SRCBLEND_INVDSTALPHA: + srcFactor = vec3(1.0 - destColor.a); + break; + case GE_SRCBLEND_DOUBLESRCALPHA: + srcFactor = vec3(v.a * 2.0); + break; + case GE_SRCBLEND_DOUBLEINVSRCALPHA: + srcFactor = vec3(1.0 - v.a * 2.0); + break; + case GE_SRCBLEND_DOUBLEDSTALPHA: + srcFactor = vec3(destColor.a * 2.0); + break; + case GE_SRCBLEND_DOUBLEINVDSTALPHA: + srcFactor = vec3(1.0 - destColor.a * 2.0); + break; + case GE_SRCBLEND_FIXA: + srcFactor = base.blendFixA; + break; + default: + srcFactor = vec3(1.0); + break; + } + switch (FS_BIT_BLENDFUNC_B) + { + case GE_DSTBLEND_SRCCOLOR: + dstFactor = v.rgb; + break; + case GE_DSTBLEND_INVSRCCOLOR: + dstFactor = (vec3(1.0) - v.rgb); + break; + case GE_DSTBLEND_SRCALPHA: + dstFactor = vec3(v.a); + break; + case GE_DSTBLEND_INVSRCALPHA: + dstFactor = vec3(1.0 - v.a); + break; + case GE_DSTBLEND_DSTALPHA: + dstFactor = vec3(destColor.a); + break; + case GE_DSTBLEND_INVDSTALPHA: + dstFactor = vec3(1.0 - destColor.a); + break; + case GE_DSTBLEND_DOUBLESRCALPHA: + dstFactor = vec3(v.a * 2.0); + break; + case GE_DSTBLEND_DOUBLEINVSRCALPHA: + dstFactor = vec3(1.0 - v.a * 2.0); + break; + case GE_DSTBLEND_DOUBLEDSTALPHA: + dstFactor = vec3(destColor.a * 2.0); + break; + case GE_DSTBLEND_DOUBLEINVDSTALPHA: + dstFactor = vec3(1.0 - destColor.a * 2.0); + break; + case GE_DSTBLEND_FIXB: + dstFactor = base.blendFixB; + break; + default: + dstFactor = vec3(0.0); + break; + } + + switch (FS_BIT_BLENDEQ) + { + case GE_BLENDMODE_MUL_AND_ADD: + v.rgb = v.rgb * srcFactor + destColor.rgb * dstFactor; + break; + case GE_BLENDMODE_MUL_AND_SUBTRACT: + v.rgb = v.rgb * srcFactor - destColor.rgb * dstFactor; + break; + case GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE: + 
v.rgb = destColor.rgb * dstFactor - v.rgb * srcFactor; + break; + case GE_BLENDMODE_MIN: + v.rgb = min(v.rgb, destColor.rgb); + break; + case GE_BLENDMODE_MAX: + v.rgb = max(v.rgb, destColor.rgb); + break; + case GE_BLENDMODE_ABSDIFF: + v.rgb = abs(v.rgb - destColor.rgb); + break; + } + } + + if (FS_BIT_REPLACE_BLEND == REPLACE_BLEND_2X_ALPHA || FS_BIT_REPLACE_BLEND == REPLACE_BLEND_PRE_SRC_2X_ALPHA) + v.a = v.a * 2.0; + } + + float replacedAlpha = 0.0; + if (FS_BIT_STENCIL_TO_ALPHA != REPLACE_ALPHA_NO) + { + switch (FS_BIT_REPLACE_ALPHA_WITH_STENCIL_TYPE) + { + case STENCIL_VALUE_UNIFORM: + replacedAlpha = base.stencilReplace; + break; + case STENCIL_VALUE_ZERO: + replacedAlpha = 0.0; + break; + case STENCIL_VALUE_ONE: + case STENCIL_VALUE_INVERT: + // In invert, we subtract by one, but we want to output one here. + replacedAlpha = 1.0; + break; + case STENCIL_VALUE_INCR_4: + case STENCIL_VALUE_DECR_4: + // We're adding/subtracting, just by the smallest value in 4-bit. + replacedAlpha = 1.0 / 15.0; + break; + case STENCIL_VALUE_INCR_8: + case STENCIL_VALUE_DECR_8: + // We're adding/subtracting, just by the smallest value in 8-bit. + replacedAlpha = 1.0 / 255.0; + break; + case STENCIL_VALUE_KEEP: + // Do nothing. We'll mask out the alpha using color mask. + break; + } + } + + switch (FS_BIT_STENCIL_TO_ALPHA) + { + case REPLACE_ALPHA_DUALSOURCE: + fragColor0 = vec4(v.rgb, replacedAlpha); + fragColor1 = vec4(0.0, 0.0, 0.0, v.a); + break; + case REPLACE_ALPHA_YES: + fragColor0 = vec4(v.rgb, replacedAlpha); + break; + case REPLACE_ALPHA_NO: + fragColor0 = v; + break; + default: + // Bad stencil - to - alpha type, corrupt ID ? + discard; + } + + switch (FS_BIT_REPLACE_LOGIC_OP_TYPE) + { + case LOGICOPTYPE_ONE: + fragColor0.rgb = vec3(1.0, 1.0, 1.0); + break; + case LOGICOPTYPE_INVERT: + fragColor0.rgb = vec3(1.0, 1.0, 1.0) - fragColor0.rgb; + break; + case LOGICOPTYPE_NORMAL: + break; + default: + // Bad logic op type, corrupt ID ? 
+ discard; + } + + if (GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT) + { + float scale = 65535.0; + if(GPU_SUPPORTS_ACCURATE_DEPTH) + { + if (GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT) + scale *= 256.0; + else if (!GPU_SUPPORTS_DEPTH_CLAMP) + scale *= 4.0; + } + float offset = mod(scale - 1, 2.0) * 0.5; + gl_FragDepth = (floor((gl_FragCoord.z * scale) - offset) + offset) / scale; + } +} diff --git a/GPU/GX2/default.vert b/GPU/GX2/default.vert new file mode 100644 index 000000000000..b22dfb00a8d1 --- /dev/null +++ b/GPU/GX2/default.vert @@ -0,0 +1,373 @@ +#version 150 +#extension GL_ARB_separate_shader_objects : enable +#extension GL_ARB_shading_language_420pack : enable + +// DEFAULT 0 +// HW_SKIN 1 +// HW_NOSKIN 2 +// SW 3 + +#define BUILD_TYPE 0 +//#define FLAT_SHADING + +struct light_t +{ + int COMP; + int TYPE; + int ENABLE; +}; + +layout(std140, binding = 4) uniform UB_VSID +{ + light_t VS_BIT_LIGHT[4]; + bool VS_BIT_LMODE; + bool VS_BIT_IS_THROUGH; + bool VS_BIT_ENABLE_FOG; + bool VS_BIT_HAS_COLOR; + bool VS_BIT_DO_TEXTURE; + bool VS_BIT_DO_TEXTURE_TRANSFORM; + bool VS_BIT_USE_HW_TRANSFORM; + bool VS_BIT_HAS_NORMAL; + bool VS_BIT_NORM_REVERSE; + bool VS_BIT_HAS_TEXCOORD; + bool VS_BIT_HAS_COLOR_TESS; + bool VS_BIT_HAS_TEXCOORD_TESS; + bool VS_BIT_NORM_REVERSE_TESS; + int VS_BIT_UVGEN_MODE; + int VS_BIT_UVPROJ_MODE; + int VS_BIT_LS0; + int VS_BIT_LS1; + int VS_BIT_BONES; + bool VS_BIT_ENABLE_BONES; + int VS_BIT_MATERIAL_UPDATE; + bool VS_BIT_SPLINE; + bool VS_BIT_LIGHTING_ENABLE; + int VS_BIT_WEIGHT_FMTSCALE; + bool VS_BIT_FLATSHADE; + bool VS_BIT_BEZIER; + bool GPU_ROUND_DEPTH_TO_16BIT; +}; + + +layout(std140, binding = 1) uniform baseVars +{ + mat4 proj_mtx; + mat4 proj_through_mtx; + mat3x4 view_mtx; + mat3x4 world_mtx; + mat3x4 tex_mtx; + vec4 uvscaleoffset; + vec4 depthRange; + vec2 fogcoef; + float stencilReplace; + vec4 matambientalpha; + uint spline_counts; + uint depal_mask_shift_off_fmt; + int pad2; + int pad3; + vec3 fogcolor; + vec3 texenv; + ivec4 alphacolorref; + 
ivec4 alphacolormask; + vec3 blendFixA; + vec3 blendFixB; + vec4 texclamp; + vec2 texclampoff; +} base; + +layout(std140, binding = 2) uniform lightVars +{ + vec4 u_ambient; + vec3 matdiffuse; + vec4 matspecular; + vec3 matemissive; + vec3 pos[4]; + vec3 dir[4]; + vec3 att[4]; + vec2 angle_spotCoef[4]; + vec3 ambient[4]; + vec3 diffuse[4]; + vec3 specular[4]; +} light; + +layout(std140, binding = 3) uniform boneVars +{ + mat3x4 m[8]; +} bone; + +layout(location = 0) in vec4 position; +layout(location = 1) in vec3 texcoord; +layout(location = 2) in vec4 color0; +layout(location = 3) in vec3 color1; +layout(location = 4) in vec3 normal; +layout(location = 5) in vec4 weight0; +layout(location = 6) in vec4 weight1; + +out gl_PerVertex { vec4 gl_Position; }; + +#define GE_LIGHTTYPE_DIRECTIONAL 0 +#define GE_LIGHTTYPE_POINT 1 +#define GE_LIGHTTYPE_SPOT 2 +#define GE_LIGHTTYPE_UNKNOWN 3 +#define GE_LIGHTCOMP_ONLYDIFFUSE 0 +#define GE_LIGHTCOMP_BOTH 1 +#define GE_LIGHTCOMP_BOTHWITHPOWDIFFUSE 2 + +#define GE_TEXMAP_TEXTURE_COORDS 0 +#define GE_TEXMAP_TEXTURE_MATRIX 1 +#define GE_TEXMAP_ENVIRONMENT_MAP 2 +#define GE_TEXMAP_UNKNOWN 3 + +#define GE_PROJMAP_POSITION 0 +#define GE_PROJMAP_UV 1 +#define GE_PROJMAP_NORMALIZED_NORMAL 2 +#define GE_PROJMAP_NORMAL 3 + +#ifdef FLAT_SHADING +#define shading flat +#else +#define shading +#endif + +layout(location = 0) out vec3 v_texcoord; +layout(location = 1) shading out vec4 v_color0; +layout(location = 2) shading out vec3 v_color1; +layout(location = 3) out float v_fogdepth; + +// DEFAULT 0 +// HW_SKIN 1 +// HW_NOSKIN 2 +// SW 3 + +#if BUILD_TYPE == 1 +#define VS_BIT_USE_HW_TRANSFORM true +#define VS_BIT_ENABLE_BONES true +#elif BUILD_TYPE == 2 +#define VS_BIT_USE_HW_TRANSFORM true +#define VS_BIT_ENABLE_BONES false +#elif BUILD_TYPE == 3 +#define VS_BIT_USE_HW_TRANSFORM false +#endif + +void main() +{ + v_color1 = vec3(0.0); + if (VS_BIT_USE_HW_TRANSFORM) + { + vec4 pos = vec4(position.xyz, 1.0); + vec4 nrm = vec4(0.0, 0.0, 1.0, 
0.0); + if (VS_BIT_HAS_NORMAL) + nrm.xyz = normal; + if (VS_BIT_NORM_REVERSE) + nrm.xyz = -nrm.xyz; + if (VS_BIT_ENABLE_BONES) + { + float weights[8] = float[8](weight0.x, weight0.y, weight0.z, weight0.w, weight1.x, weight1.y, weight1.z, weight1.w); + mat3x4 skinMatrix = weight0.x * bone.m[0]; + for (int i = 1; i < VS_BIT_BONES + 1; i++) + skinMatrix += weights[i] * bone.m[i]; + + pos.xyz = pos * skinMatrix; + nrm.xyz = nrm * skinMatrix; + } + // Step 1: World Transform + vec4 worldpos = vec4(pos * base.world_mtx, 1.0); + mediump vec3 worldnormal = normalize(nrm * base.world_mtx); + vec4 viewPos = vec4(worldpos * base.view_mtx, 1.0); + // Final view and projection transforms. + gl_Position = base.proj_mtx * viewPos; + + // Calculate lights if needed. If shade mapping is enabled, lights may need to be + // at least partially calculated. + if (VS_BIT_LIGHTING_ENABLE) + { + // TODO: Declare variables for dots for shade mapping if needed + vec4 ambient = base.matambientalpha; + vec3 diffuse = light.matdiffuse; + vec3 specular = light.matspecular.rgb; + + if (VS_BIT_HAS_COLOR) + { + if (bool(VS_BIT_MATERIAL_UPDATE & 1)) + ambient = color0; + + if (bool(VS_BIT_MATERIAL_UPDATE & 2)) + diffuse = color0.rgb; + + if (bool(VS_BIT_MATERIAL_UPDATE & 4)) + specular = color0.rgb; + } + + vec4 lightSum0 = light.u_ambient * ambient + vec4(light.matemissive, 0.0); + vec3 lightSum1 = vec3(0.0); + + for (int i = 0; i < 4; i++) + { + if (!(true && bool(VS_BIT_LIGHT[i].ENABLE))) + continue; + + vec3 toLight = light.pos[i]; + float lightScale; // Attenuation + + if (VS_BIT_LIGHT[i].TYPE == GE_LIGHTTYPE_DIRECTIONAL) + lightScale = 1.0; + else + { + // We prenormalize light positions for directional lights. 
+ float distance; + toLight -= worldpos.xyz; + distance = length(toLight); + toLight /= distance; + lightScale = clamp(1.0 / dot(light.att[i], vec3(1.0, distance, distance * distance)), 0.0, 1.0); + } + + if (VS_BIT_LIGHT[i].TYPE >= GE_LIGHTTYPE_SPOT) + { + float angle = dot(normalize(light.dir[i]), toLight); + if (angle >= light.angle_spotCoef[i].x) + lightScale *= pow(angle, light.angle_spotCoef[i].y); + else + lightScale = 0.0; + } + + // pow(0.0, 0.0) may be undefined, but the PSP seems to treat it as 1.0. + // Seen in Tales of the World: Radiant Mythology (#2424.) + mediump float doti = max(dot(toLight, worldnormal), 0.0000001); // smallest positive mediump is 0.00000006 + if (VS_BIT_LIGHT[i].COMP == GE_LIGHTCOMP_BOTHWITHPOWDIFFUSE) + doti = pow(doti, light.matspecular.a); // does this only apply to lightSum0 ? + + lightSum0.rgb += (light.ambient[i] * ambient.rgb + diffuse * light.diffuse[i] * doti) * lightScale; + + // specular + if (VS_BIT_LIGHT[i].COMP != GE_LIGHTCOMP_ONLYDIFFUSE) + { + doti = dot(normalize(toLight + vec3(0.0, 0.0, 1.0)), worldnormal); + if (doti > 0.0) + lightSum1 += light.specular[i] * specular * (pow(doti, light.matspecular.a) * lightScale); + } + } + + // Sum up ambient, emissive here. + if (VS_BIT_LMODE) + { + v_color0 = clamp(lightSum0, 0.0, 1.0); + v_color1 = clamp(lightSum1, 0.0, 1.0); + } + else + v_color0 = clamp(clamp(lightSum0, 0.0, 1.0) + vec4(lightSum1, 0.0), 0.0, 1.0); + } + else + { + // Lighting doesn't affect color. + if (VS_BIT_HAS_COLOR) + v_color0 = color0; + else + v_color0 = base.matambientalpha; + } + + // Step 3: UV generation + if (VS_BIT_DO_TEXTURE) + { + switch (VS_BIT_UVGEN_MODE) + { + case GE_TEXMAP_TEXTURE_COORDS: // Scale-offset. Easy. + case GE_TEXMAP_UNKNOWN: // Not sure what this is, but Riviera uses it. Treating as coords works. 
+ if (!VS_BIT_IS_THROUGH) + { + if (VS_BIT_HAS_TEXCOORD) + v_texcoord = vec3(texcoord.xy * base.uvscaleoffset.xy, 0.0); + else + v_texcoord = vec3(0.0); + } + else + { + if (VS_BIT_HAS_TEXCOORD) + v_texcoord = vec3(texcoord.xy * base.uvscaleoffset.xy + base.uvscaleoffset.zw, 0.0); + else + v_texcoord = vec3(base.uvscaleoffset.zw, 0.0); + } + break; + + case GE_TEXMAP_TEXTURE_MATRIX: // Projection mapping. + { + vec4 temp_tc; + switch (VS_BIT_UVPROJ_MODE) + { + case GE_PROJMAP_POSITION: // Use model space XYZ as source + temp_tc = vec4(position.xyz, 1.0); + break; + case GE_PROJMAP_UV: // Use unscaled UV as source + { + // scaleUV is false here. + if (VS_BIT_HAS_TEXCOORD) + temp_tc = vec4(texcoord.xy, 0.0, 1.0); + else + temp_tc = vec4(0.0, 0.0, 0.0, 1.0); + } + break; + case GE_PROJMAP_NORMALIZED_NORMAL: // Use normalized transformed normal as source + if (VS_BIT_HAS_NORMAL) + temp_tc = vec4(normalize((VS_BIT_NORM_REVERSE ? -normal : normal)), 1.0); + else + temp_tc = vec4(0.0, 0.0, 1.0, 1.0); + break; + case GE_PROJMAP_NORMAL: // Use non-normalized transformed normal as source + if (VS_BIT_HAS_NORMAL) + temp_tc = vec4((VS_BIT_NORM_REVERSE ? -normal : normal), 1.0); + else + temp_tc = vec4(0.0, 0.0, 1.0, 1.0); + break; + } + // Transform by texture matrix. XYZ as we are doing projection mapping. + v_texcoord = (temp_tc * base.tex_mtx).xyz * vec3(base.uvscaleoffset.xy, 1.0); + } + break; + + case GE_TEXMAP_ENVIRONMENT_MAP: // Shade mapping - use dots from light sources. 
+ v_texcoord = vec3(base.uvscaleoffset.xy * vec2(1.0 + dot(normalize(light.pos[VS_BIT_LS0]), worldnormal), + 1.0 + dot(normalize(light.pos[VS_BIT_LS1]), worldnormal)) * 0.5, 1.0); + break; + default: + // ILLEGAL + break; + } + } + // Compute fogdepth + v_fogdepth = (viewPos.z + base.fogcoef.x) * base.fogcoef.y; // if (VS_BIT_ENABLE_FOG) + } + else + { + // Simple pass-through of vertex data to fragment shader + if (VS_BIT_DO_TEXTURE) + { + if (VS_BIT_DO_TEXTURE_TRANSFORM && !VS_BIT_IS_THROUGH) + v_texcoord = texcoord; + else + v_texcoord = vec3(texcoord.xy, 1.0); + } + if (VS_BIT_HAS_COLOR) + { + v_color0 = color0; + if (VS_BIT_LMODE) + v_color1 = color1; + } + else + v_color0 = base.matambientalpha; + + v_fogdepth = position.w; // if (VS_BIT_ENABLE_FOG) + + if (VS_BIT_IS_THROUGH) + gl_Position = base.proj_through_mtx * vec4(position.xyz, 1.0); + else + gl_Position = base.proj_mtx * vec4(position.xyz, 1.0); + } + if (GPU_ROUND_DEPTH_TO_16BIT) + { + gl_Position.z /= gl_Position.w; + gl_Position.z = gl_Position.z * base.depthRange.x + base.depthRange.y; + gl_Position.z = floor(gl_Position.z); + gl_Position.z = (gl_Position.z - base.depthRange.z) * base.depthRange.w; + gl_Position.z *= gl_Position.w; + } +} + diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp index 9933fac43e9d..efdb24de16e2 100644 --- a/GPU/Software/Rasterizer.cpp +++ b/GPU/Software/Rasterizer.cpp @@ -1458,7 +1458,7 @@ void ClearRectangle(const VertexData &v0, const VertexData &v1) DrawingCoords p = TransformUnit::ScreenToDrawing(pprime); if ((z & 0xFF) == (z >> 8)) { - u16 *row = &depthbuf.as16[p.x + p.y * stride]; + void *row = &depthbuf.as16[p.x + p.y * stride]; memset(row, z, w * 2); } else { for (int x = 0; x < w; ++x) { @@ -1512,8 +1512,8 @@ void ClearRectangle(const VertexData &v0, const VertexData &v1) if (gstate.FrameBufFormat() == GE_FORMAT_8888) { for (pprime.y = minY; pprime.y < maxY; pprime.y += 16) { DrawingCoords p = TransformUnit::ScreenToDrawing(pprime); - if 
((new_color & 0xFF) == (new_color >> 8) && (new_color & 0xFFFF) == (new_color >> 16)) { - u32 *row = &fb.as32[p.x + p.y * stride]; + if ((new_color & 0xFF) == (u8)(new_color >> 8) && (new_color & 0xFFFF) == (new_color >> 16)) { + void *row = &fb.as32[p.x + p.y * stride]; memset(row, new_color, w * 4); } else { for (int x = 0; x < w; ++x) { @@ -1525,7 +1525,7 @@ void ClearRectangle(const VertexData &v0, const VertexData &v1) for (pprime.y = minY; pprime.y < maxY; pprime.y += 16) { DrawingCoords p = TransformUnit::ScreenToDrawing(pprime); if ((new_color16 & 0xFF) == (new_color16 >> 8)) { - u16 *row = &fb.as16[p.x + p.y * stride]; + void *row = &fb.as16[p.x + p.y * stride]; memset(row, new_color16, w * 2); } else { for (int x = 0; x < w; ++x) { diff --git a/GPU/Software/RasterizerRectangle.cpp b/GPU/Software/RasterizerRectangle.cpp index b6697b52c7ee..d3a1965d2244 100644 --- a/GPU/Software/RasterizerRectangle.cpp +++ b/GPU/Software/RasterizerRectangle.cpp @@ -31,7 +31,7 @@ extern bool currentDialogActive; namespace Rasterizer { // Through mode, with the specific Darkstalker settings. 
-inline void DrawSinglePixel5551(u16 *pixel, const Vec4 &color_in) { +inline void DrawSinglePixel5551(u16_le *pixel, const Vec4 &color_in) { u32 new_color; if (color_in.a() == 255) { new_color = color_in.ToRGBA() & 0xFFFFFF; @@ -143,7 +143,7 @@ void DrawSprite(const VertexData& v0, const VertexData& v1) { int t = t_start; for (int y = pos0.y; y < pos1.y; y++) { int s = s_start; - u16 *pixel = fb.Get16Ptr(pos0.x, y, gstate.FrameBufStride()); + u16_le *pixel = fb.Get16Ptr(pos0.x, y, gstate.FrameBufStride()); if (isWhite) { for (int x = pos0.x; x < pos1.x; x++) { u32 tex_color = nearestFunc(s, t, texptr, texbufw, 0); @@ -205,7 +205,7 @@ void DrawSprite(const VertexData& v0, const VertexData& v1) { return; for (int y = pos0.y; y < pos1.y; y++) { - u16 *pixel = fb.Get16Ptr(pos0.x, y, gstate.FrameBufStride()); + u16_le *pixel = fb.Get16Ptr(pos0.x, y, gstate.FrameBufStride()); for (int x = pos0.x; x < pos1.x; x++) { Vec4 prim_color = v0.color0; DrawSinglePixel5551(pixel, prim_color); diff --git a/GPU/Software/Sampler.cpp b/GPU/Software/Sampler.cpp index 0442cde30d93..abd821c95e00 100644 --- a/GPU/Software/Sampler.cpp +++ b/GPU/Software/Sampler.cpp @@ -277,16 +277,16 @@ static inline u32 LookupColor(unsigned int index, unsigned int level) switch (gstate.getClutPaletteFormat()) { case GE_CMODE_16BIT_BGR5650: - return RGB565ToRGBA8888(reinterpret_cast(clut)[index + clutSharingOffset]); + return RGB565ToRGBA8888(reinterpret_cast(clut)[index + clutSharingOffset]); case GE_CMODE_16BIT_ABGR5551: - return RGBA5551ToRGBA8888(reinterpret_cast(clut)[index + clutSharingOffset]); + return RGBA5551ToRGBA8888(reinterpret_cast(clut)[index + clutSharingOffset]); case GE_CMODE_16BIT_ABGR4444: - return RGBA4444ToRGBA8888(reinterpret_cast(clut)[index + clutSharingOffset]); + return RGBA4444ToRGBA8888(reinterpret_cast(clut)[index + clutSharingOffset]); case GE_CMODE_32BIT_ABGR8888: - return clut[index + clutSharingOffset]; + return reinterpret_cast(clut)[index + clutSharingOffset]; default: 
ERROR_LOG_REPORT(G3D, "Software: Unsupported palette format: %x", gstate.getClutPaletteFormat()); @@ -319,35 +319,35 @@ inline static Nearest4 SampleNearest(int u[N], int v[N], const u8 *srcptr, int t case GE_TFMT_4444: for (int i = 0; i < N; ++i) { const u8 *src = srcptr + GetPixelDataOffset<16>(texbufw, u[i], v[i]); - res.v[i] = RGBA4444ToRGBA8888(*(const u16 *)src); + res.v[i] = RGBA4444ToRGBA8888(*(const u16_le *)src); } return res; case GE_TFMT_5551: for (int i = 0; i < N; ++i) { const u8 *src = srcptr + GetPixelDataOffset<16>(texbufw, u[i], v[i]); - res.v[i] = RGBA5551ToRGBA8888(*(const u16 *)src); + res.v[i] = RGBA5551ToRGBA8888(*(const u16_le *)src); } return res; case GE_TFMT_5650: for (int i = 0; i < N; ++i) { const u8 *src = srcptr + GetPixelDataOffset<16>(texbufw, u[i], v[i]); - res.v[i] = RGB565ToRGBA8888(*(const u16 *)src); + res.v[i] = RGB565ToRGBA8888(*(const u16_le *)src); } return res; case GE_TFMT_8888: for (int i = 0; i < N; ++i) { const u8 *src = srcptr + GetPixelDataOffset<32>(texbufw, u[i], v[i]); - res.v[i] = *(const u32 *)src; + res.v[i] = *(const u32_le *)src; } return res; case GE_TFMT_CLUT32: for (int i = 0; i < N; ++i) { const u8 *src = srcptr + GetPixelDataOffset<32>(texbufw, u[i], v[i]); - u32 val = src[0] + (src[1] << 8) + (src[2] << 16) + (src[3] << 24); + u32 val = *(u32_le *)src; res.v[i] = LookupColor(gstate.transformClutIndex(val), 0); } return res; @@ -355,7 +355,7 @@ inline static Nearest4 SampleNearest(int u[N], int v[N], const u8 *srcptr, int t case GE_TFMT_CLUT16: for (int i = 0; i < N; ++i) { const u8 *src = srcptr + GetPixelDataOffset<16>(texbufw, u[i], v[i]); - u16 val = src[0] + (src[1] << 8); + u16 val = *(u16_le *)src; res.v[i] = LookupColor(gstate.transformClutIndex(val), 0); } return res; @@ -380,7 +380,7 @@ inline static Nearest4 SampleNearest(int u[N], int v[N], const u8 *srcptr, int t case GE_TFMT_DXT1: for (int i = 0; i < N; ++i) { const DXT1Block *block = (const DXT1Block *)srcptr + (v[i] / 4) * (texbufw / 4) + 
(u[i] / 4); - u32 data[4 * 4]; + u32_le data[4 * 4]; DecodeDXT1Block(data, block, 4, 4, false); res.v[i] = data[4 * (v[i] % 4) + (u[i] % 4)]; } @@ -389,7 +389,7 @@ inline static Nearest4 SampleNearest(int u[N], int v[N], const u8 *srcptr, int t case GE_TFMT_DXT3: for (int i = 0; i < N; ++i) { const DXT3Block *block = (const DXT3Block *)srcptr + (v[i] / 4) * (texbufw / 4) + (u[i] / 4); - u32 data[4 * 4]; + u32_le data[4 * 4]; DecodeDXT3Block(data, block, 4, 4); res.v[i] = data[4 * (v[i] % 4) + (u[i] % 4)]; } @@ -398,7 +398,7 @@ inline static Nearest4 SampleNearest(int u[N], int v[N], const u8 *srcptr, int t case GE_TFMT_DXT5: for (int i = 0; i < N; ++i) { const DXT5Block *block = (const DXT5Block *)srcptr + (v[i] / 4) * (texbufw / 4) + (u[i] / 4); - u32 data[4 * 4]; + u32_le data[4 * 4]; DecodeDXT5Block(data, block, 4, 4); res.v[i] = data[4 * (v[i] % 4) + (u[i] % 4)]; } diff --git a/GPU/Software/SoftGpu.cpp b/GPU/Software/SoftGpu.cpp index d0d49548b90c..091ea56614e6 100644 --- a/GPU/Software/SoftGpu.cpp +++ b/GPU/Software/SoftGpu.cpp @@ -140,25 +140,27 @@ void SoftGPU::ConvertTextureDescFrom16(Draw::TextureDesc &desc, int srcwidth, in FormatBuffer displayBuffer; displayBuffer.data = overrideData ? 
overrideData : Memory::GetPointer(displayFramebuf_); for (int y = 0; y < srcheight; ++y) { - u32 *buf_line = &fbTexBuffer_[y * srcwidth]; - const u16 *fb_line = &displayBuffer.as16[y * displayStride_]; + u32_le *buf_line = &fbTexBuffer_[y * srcwidth]; + for (u32 x = 0; x < srcwidth; x++) { + u16 col = displayBuffer.Get16(x,y,displayStride_); - switch (displayFormat_) { - case GE_FORMAT_565: - ConvertRGB565ToRGBA8888(buf_line, fb_line, srcwidth); - break; + switch (displayFormat_) { + case GE_FORMAT_565: + buf_line[x] = Convert5To8((col) & 0x1f) | Convert6To8((col >> 5) & 0x3f) << 8 | Convert5To8((col >> 11) & 0x1f) << 16 | 255 << 24; + break; - case GE_FORMAT_5551: - ConvertRGBA5551ToRGBA8888(buf_line, fb_line, srcwidth); - break; + case GE_FORMAT_5551: + buf_line[x] = Convert5To8((col) & 0x1f) | Convert5To8((col >> 5) & 0x1f) << 8 | Convert5To8((col >> 10) & 0x1f) << 16 | (col >> 15) ? 255 << 24: 0; + break; - case GE_FORMAT_4444: - ConvertRGBA4444ToRGBA8888(buf_line, fb_line, srcwidth); - break; + case GE_FORMAT_4444: + buf_line[x] = Convert4To8(col & 0xf) | Convert4To8((col >> 4) & 0xf) << 8 | Convert4To8((col >> 8) & 0xf) << 16 | Convert4To8(col >> 12) << 24; + break; - default: - ERROR_LOG_REPORT(G3D, "Software: Unexpected framebuffer format: %d", displayFormat_); - break; + default: + ERROR_LOG_REPORT(G3D, "Software: Unexpected framebuffer format: %d", displayFormat_); + break; + } } } @@ -272,6 +274,7 @@ void SoftGPU::CopyToCurrentFboFromDisplayRam(int srcwidth, int srcheight) { break; case GPUBackend::DIRECT3D9: case GPUBackend::DIRECT3D11: + case GPUBackend::GX2: outputFlags |= OutputFlags::POSITION_FLIPPED; break; case GPUBackend::VULKAN: @@ -953,14 +956,20 @@ bool SoftGPU::GetCurrentFramebuffer(GPUDebugBuffer &buffer, GPUDebugFramebufferT buffer.Allocate(x2 - x1, y2 - y1, fmt); - const int depth = fmt == GE_FORMAT_8888 ? 
4 : 2; - const u8 *src = fb.data + stride * depth * y1; - u8 *dst = buffer.GetData(); - const int byteWidth = (x2 - x1) * depth; - for (int y = y1; y < y2; ++y) { - memcpy(dst, src + x1, byteWidth); - dst += byteWidth; - src += stride * depth; + if(fmt == GE_FORMAT_8888) { + u32_le *dst = (u32_le *)buffer.GetData(); + for (int y = y1; y < y2; ++y) { + for (int x = x1; x < x2; ++x) { + *dst++ = fb.Get32(x, y, stride); + } + } + } else { + u16_le *dst = (u16_le *)buffer.GetData(); + for (int y = y1; y < y2; ++y) { + for (int x = x1; x < x2; ++x) { + *dst++ = fb.Get16(x, y, stride); + } + } } return true; } diff --git a/GPU/Software/SoftGpu.h b/GPU/Software/SoftGpu.h index fc9bf68b7ebe..ad1b1ecbe5e1 100644 --- a/GPU/Software/SoftGpu.h +++ b/GPU/Software/SoftGpu.h @@ -25,8 +25,8 @@ struct FormatBuffer { FormatBuffer() { data = nullptr; } union { u8 *data; - u16 *as16; - u32 *as32; + u16_le *as16; + u32_le *as32; }; inline void Set16(int x, int y, int stride, u16 v) { @@ -45,7 +45,7 @@ struct FormatBuffer { return as32[x + y * stride]; } - inline u16 *Get16Ptr(int x, int y, int stride) { + inline u16_le *Get16Ptr(int x, int y, int stride) { return &as16[x + y * stride]; } }; @@ -114,7 +114,7 @@ class SoftGPU : public GPUCommon { SoftwareDrawEngine *drawEngine_ = nullptr; Draw::Texture *fbTex = nullptr; - std::vector<u32> fbTexBuffer_; + std::vector<u32_le> fbTexBuffer_; }; // TODO: These shouldn't be global.
diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp index a30f7f875faa..f7582b69d006 100644 --- a/GPU/Software/TransformUnit.cpp +++ b/GPU/Software/TransformUnit.cpp @@ -565,8 +565,8 @@ bool TransformUnit::GetCurrentSimpleVertices(int count, std::vector 0 && (gstate.vertType & GE_VTYPE_IDX_MASK) != GE_VTYPE_IDX_NONE) { const u8 *inds = Memory::GetPointer(gstate_c.indexAddr); - const u16 *inds16 = (const u16 *)inds; - const u32 *inds32 = (const u32 *)inds; + const u16_le *inds16 = (const u16_le *)inds; + const u32_le *inds32 = (const u32_le *)inds; if (inds) { GetIndexBounds(inds, count, gstate.vertType, &indexLowerBound, &indexUpperBound); diff --git a/GPU/Vulkan/DepalettizeShaderVulkan.cpp b/GPU/Vulkan/DepalettizeShaderVulkan.cpp index f0ab47eb1c58..5a48d8da91ec 100644 --- a/GPU/Vulkan/DepalettizeShaderVulkan.cpp +++ b/GPU/Vulkan/DepalettizeShaderVulkan.cpp @@ -123,7 +123,7 @@ DepalShaderVulkan *DepalShaderCacheVulkan::GetDepalettizeShader(uint32_t clutMod return depal; } -VulkanTexture *DepalShaderCacheVulkan::GetClutTexture(GEPaletteFormat clutFormat, u32 clutHash, u32 *rawClut, bool expandTo32bit) { +VulkanTexture *DepalShaderCacheVulkan::GetClutTexture(GEPaletteFormat clutFormat, u32 clutHash, u32_le *rawClut, bool expandTo32bit) { u32 clutId = GetClutID(clutFormat, clutHash); auto oldtex = texCache_.find(clutId); if (oldtex != texCache_.end()) { @@ -138,18 +138,18 @@ VulkanTexture *DepalShaderCacheVulkan::GetClutTexture(GEPaletteFormat clutFormat int texturePixels = clutFormat == GE_CMODE_32BIT_ABGR8888 ? 256 : 512; int bpp = clutFormat == GE_CMODE_32BIT_ABGR8888 ? 
4 : 2; VkFormat dstFmt; - uint32_t *expanded = nullptr; + u32_le *expanded = nullptr; if (expandTo32bit && clutFormat != GE_CMODE_32BIT_ABGR8888) { - expanded = new uint32_t[texturePixels]; + expanded = new u32_le[texturePixels]; switch (clutFormat) { case GE_CMODE_16BIT_ABGR4444: - ConvertRGBA4444ToRGBA8888(expanded, (const uint16_t *)rawClut, texturePixels); + ConvertRGBA4444ToRGBA8888(expanded, (const u16_le *)rawClut, texturePixels); break; case GE_CMODE_16BIT_ABGR5551: - ConvertRGBA5551ToRGBA8888(expanded, (const uint16_t *)rawClut, texturePixels); + ConvertRGBA5551ToRGBA8888(expanded, (const u16_le *)rawClut, texturePixels); break; case GE_CMODE_16BIT_BGR5650: - ConvertRGB565ToRGBA8888(expanded, (const uint16_t *)rawClut, texturePixels); + ConvertRGB565ToRGBA8888(expanded, (const u16_le *)rawClut, texturePixels); break; default: break; diff --git a/GPU/Vulkan/DepalettizeShaderVulkan.h b/GPU/Vulkan/DepalettizeShaderVulkan.h index ec01a17b9648..543a4f53a4e6 100644 --- a/GPU/Vulkan/DepalettizeShaderVulkan.h +++ b/GPU/Vulkan/DepalettizeShaderVulkan.h @@ -57,7 +57,7 @@ class DepalShaderCacheVulkan : public DepalShaderCacheCommon { // This also uploads the palette and binds the correct texture. 
DepalShaderVulkan *GetDepalettizeShader(uint32_t clutMode, GEBufferFormat pixelFormat); - VulkanTexture *GetClutTexture(GEPaletteFormat clutFormat, const u32 clutHash, u32 *rawClut, bool expandTo32bit); + VulkanTexture *GetClutTexture(GEPaletteFormat clutFormat, const u32 clutHash, u32_le *rawClut, bool expandTo32bit); void Clear(); void Decimate(); diff --git a/GPU/Vulkan/TextureCacheVulkan.cpp b/GPU/Vulkan/TextureCacheVulkan.cpp index 93e39d87fd45..049e2bc48ae3 100644 --- a/GPU/Vulkan/TextureCacheVulkan.cpp +++ b/GPU/Vulkan/TextureCacheVulkan.cpp @@ -1067,7 +1067,7 @@ VkFormat TextureCacheVulkan::GetDestFormat(GETextureFormat format, GEPaletteForm } } -TexCacheEntry::TexStatus TextureCacheVulkan::CheckAlpha(const u32 *pixelData, VkFormat dstFmt, int stride, int w, int h) { +TexCacheEntry::TexStatus TextureCacheVulkan::CheckAlpha(const u32_le *pixelData, VkFormat dstFmt, int stride, int w, int h) { CheckAlphaResult res; switch (dstFmt) { case VULKAN_4444_FORMAT: @@ -1119,7 +1119,7 @@ void TextureCacheVulkan::LoadTextureLevel(TexCacheEntry &entry, uint8_t *writePt if ((entry.status & TexCacheEntry::STATUS_CHANGE_FREQUENT) == 0) { // TODO: When we decode directly, this can be more expensive (maybe not on mobile?) // This does allow us to skip alpha testing, though. 
- TexCacheEntry::TexStatus alphaStatus = CheckAlpha(pixelData, dstFmt, decPitch / bpp, w, h); + TexCacheEntry::TexStatus alphaStatus = CheckAlpha((u32_le*)pixelData, dstFmt, decPitch / bpp, w, h); entry.SetAlphaStatus(alphaStatus, level); } else { entry.SetAlphaStatus(TexCacheEntry::STATUS_ALPHA_UNKNOWN); diff --git a/GPU/Vulkan/TextureCacheVulkan.h b/GPU/Vulkan/TextureCacheVulkan.h index a9163f2e1869..66f60ad66012 100644 --- a/GPU/Vulkan/TextureCacheVulkan.h +++ b/GPU/Vulkan/TextureCacheVulkan.h @@ -119,7 +119,7 @@ class TextureCacheVulkan : public TextureCacheCommon { private: void LoadTextureLevel(TexCacheEntry &entry, uint8_t *writePtr, int rowPitch, int level, int scaleFactor, VkFormat dstFmt); VkFormat GetDestFormat(GETextureFormat format, GEPaletteFormat clutFormat) const; - TexCacheEntry::TexStatus CheckAlpha(const u32 *pixelData, VkFormat dstFmt, int stride, int w, int h); + TexCacheEntry::TexStatus CheckAlpha(const u32_le *pixelData, VkFormat dstFmt, int stride, int w, int h); void UpdateCurrentClut(GEPaletteFormat clutFormat, u32 clutBase, bool clutIndexIsSimple) override; void ApplyTextureFramebuffer(TexCacheEntry *entry, VirtualFramebuffer *framebuffer) override; diff --git a/GPU/Vulkan/TextureScalerVulkan.cpp b/GPU/Vulkan/TextureScalerVulkan.cpp index 27f43bcdaff9..a7598bf15ca2 100644 --- a/GPU/Vulkan/TextureScalerVulkan.cpp +++ b/GPU/Vulkan/TextureScalerVulkan.cpp @@ -51,15 +51,15 @@ void TextureScalerVulkan::ConvertTo8888(u32 format, u32* source, u32* &dest, int break; case VULKAN_4444_FORMAT: - GlobalThreadPool::Loop(std::bind(&convert4444_dx9, (u16*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height); + GlobalThreadPool::Loop(std::bind(&convert4444_dx9, (u16_le*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height); break; case VULKAN_565_FORMAT: - GlobalThreadPool::Loop(std::bind(&convert565_dx9, (u16*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height); + 
GlobalThreadPool::Loop(std::bind(&convert565_dx9, (u16_le*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height); break; case VULKAN_1555_FORMAT: - GlobalThreadPool::Loop(std::bind(&convert5551_dx9, (u16*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height); + GlobalThreadPool::Loop(std::bind(&convert5551_dx9, (u16_le*)source, dest, width, std::placeholders::_1, std::placeholders::_2), 0, height); break; default: diff --git a/UI/EmuScreen.cpp b/UI/EmuScreen.cpp index ba3cd459763e..6e65c879f297 100644 --- a/UI/EmuScreen.cpp +++ b/UI/EmuScreen.cpp @@ -220,6 +220,9 @@ void EmuScreen::bootGame(const std::string &filename) { CoreParameter coreParam{}; coreParam.cpuCore = (CPUCore)g_Config.iCpuCore; +#if PPSSPP_PLATFORM(WIIU) + coreParam.gpuCore = GPUCORE_GX2; +#else coreParam.gpuCore = GPUCORE_GLES; switch (GetGPUBackend()) { case GPUBackend::DIRECT3D11: @@ -239,6 +242,7 @@ void EmuScreen::bootGame(const std::string &filename) { break; #endif } +#endif // Preserve the existing graphics context. coreParam.graphicsContext = PSP_CoreParameter().graphicsContext; @@ -577,7 +581,7 @@ void EmuScreen::onVKeyDown(int virtualKeyCode) { if (g_Config.bDumpFrames == g_Config.bDumpAudio) { g_Config.bDumpFrames = !g_Config.bDumpFrames; g_Config.bDumpAudio = !g_Config.bDumpAudio; - } else { + } else { // This hotkey should always toggle both audio and video together. // So let's make sure that's the only outcome even if video OR audio was already being dumped. 
if (g_Config.bDumpFrames) { @@ -1567,7 +1571,7 @@ void EmuScreen::renderUI() { DrawFrameTimes(ctx, ctx->GetLayoutBounds()); } -#if !PPSSPP_PLATFORM(UWP) +#if !PPSSPP_PLATFORM(UWP) && !defined(NO_VULKAN) if (g_Config.iGPUBackend == (int)GPUBackend::VULKAN && g_Config.bShowAllocatorDebug) { DrawAllocatorVis(ctx, gpu); } diff --git a/UI/GameSettingsScreen.cpp b/UI/GameSettingsScreen.cpp index 0e248c29811b..d345d89e9b0f 100644 --- a/UI/GameSettingsScreen.cpp +++ b/UI/GameSettingsScreen.cpp @@ -217,7 +217,7 @@ void GameSettingsScreen::CreateViews() { graphicsSettings->Add(new ItemHeader(gr->T("Rendering Mode"))); -#if !PPSSPP_PLATFORM(UWP) +#if !PPSSPP_PLATFORM(UWP) && !PPSSPP_PLATFORM(WIIU) static const char *renderingBackend[] = { "OpenGL", "Direct3D 9", "Direct3D 11", "Vulkan" }; PopupMultiChoice *renderingBackendChoice = graphicsSettings->Add(new PopupMultiChoice(&g_Config.iGPUBackend, gr->T("Backend"), renderingBackend, (int)GPUBackend::OPENGL, ARRAY_SIZE(renderingBackend), gr->GetName(), screenManager())); renderingBackendChoice->OnChoice.Handle(this, &GameSettingsScreen::OnRenderingBackend); diff --git a/UI/MainScreen.cpp b/UI/MainScreen.cpp index ca03869cd812..3f4102f82416 100644 --- a/UI/MainScreen.cpp +++ b/UI/MainScreen.cpp @@ -553,6 +553,8 @@ UI::EventReturn GameBrowser::HomeClick(UI::EventParams &e) { #elif PPSSPP_PLATFORM(UWP) // TODO UWP SetPath(g_Config.memStickDirectory); +#elif defined(__wiiu__) + path_.SetPath("/"); #else SetPath(getenv("HOME")); #endif diff --git a/UI/NativeApp.cpp b/UI/NativeApp.cpp index 047111665019..a9cd79f730d4 100644 --- a/UI/NativeApp.cpp +++ b/UI/NativeApp.cpp @@ -327,7 +327,7 @@ static void PostLoadConfig() { if (g_Config.currentDirectory.empty()) { #if defined(__ANDROID__) g_Config.currentDirectory = g_Config.externalDirectory; -#elif defined(IOS) +#elif defined(IOS) || defined(__wiiu__) g_Config.currentDirectory = g_Config.internalDataDirectory; #elif PPSSPP_PLATFORM(SWITCH) g_Config.currentDirectory = "/"; @@ -509,6 
+509,9 @@ void NativeInit(int argc, const char *argv[], const char *savegame_dir, const ch #elif PPSSPP_PLATFORM(SWITCH) g_Config.memStickDirectory = g_Config.internalDataDirectory + "config/ppsspp/"; g_Config.flash0Directory = g_Config.internalDataDirectory + "assets/flash0/"; +#elif defined(__wiiu__) + g_Config.memStickDirectory = "sd:/ppsspp/"; + g_Config.flash0Directory = "sd:/ppsspp/assets/flash0/"; #elif !defined(_WIN32) std::string config; if (getenv("XDG_CONFIG_HOME") != NULL) @@ -1066,6 +1069,7 @@ void NativeRender(GraphicsContext *graphicsContext) { translation.setTranslation(Vec3(-0.5f * g_dpi_scale_x / g_dpi_scale_real_x, -0.5f * g_dpi_scale_y / g_dpi_scale_real_y, 0.0f)); ortho = translation * ortho; break; + case GPUBackend::GX2: case GPUBackend::DIRECT3D11: ortho.setOrthoD3D(0.0f, xres, yres, 0.0f, -1.0f, 1.0f); break; diff --git a/WiiU/GX2GraphicsContext.cpp b/WiiU/GX2GraphicsContext.cpp new file mode 100644 index 000000000000..d569c55163b3 --- /dev/null +++ b/WiiU/GX2GraphicsContext.cpp @@ -0,0 +1,148 @@ + +#define GX2_COMP_SEL +#include "WiiU/GX2GraphicsContext.h" +#include "thin3d/thin3d.h" +#include "thin3d/thin3d_create.h" +#include "Core/System.h" +#include "base/NativeApp.h" +#include "input/input_state.h" + +#include +#include +#include + +static bool swap_is_pending(void *start_time) { + uint32_t swap_count, flip_count; + OSTime last_flip, last_vsync; + + GX2GetSwapStatus(&swap_count, &flip_count, &last_flip, &last_vsync); + + return last_vsync < *(OSTime *)start_time; +} + +bool GX2GraphicsContext::Init() { + static const RenderMode render_mode_map[] = { + { 0 }, /* GX2_TV_SCAN_MODE_NONE */ + { 854, 480, GX2_TV_RENDER_MODE_WIDE_480P }, /* GX2_TV_SCAN_MODE_576I */ + { 854, 480, GX2_TV_RENDER_MODE_WIDE_480P }, /* GX2_TV_SCAN_MODE_480I */ + { 854, 480, GX2_TV_RENDER_MODE_WIDE_480P }, /* GX2_TV_SCAN_MODE_480P */ + { 1280, 720, GX2_TV_RENDER_MODE_WIDE_720P }, /* GX2_TV_SCAN_MODE_720P */ + { 0 }, /* GX2_TV_SCAN_MODE_unk */ + { 1920, 1080, 
GX2_TV_RENDER_MODE_WIDE_1080P }, /* GX2_TV_SCAN_MODE_1080I */ + { 1920, 1080, GX2_TV_RENDER_MODE_WIDE_1080P } /* GX2_TV_SCAN_MODE_1080P */ + }; + render_mode_ = render_mode_map[GX2GetSystemTVScanMode()]; + render_mode_ = render_mode_map[GX2_TV_SCAN_MODE_480P]; + + cmd_buffer_ = MEM2_alloc(0x400000, 0x40); + u32 init_attributes[] = { GX2_INIT_CMD_BUF_BASE, (u32)cmd_buffer_, GX2_INIT_CMD_BUF_POOL_SIZE, 0x400000, GX2_INIT_ARGC, 0, GX2_INIT_ARGV, 0, GX2_INIT_END }; + GX2Init(init_attributes); + u32 size = 0; + u32 tmp = 0; + GX2CalcTVSize(render_mode_.mode, GX2_SURFACE_FORMAT_UNORM_R8_G8_B8_A8, GX2_BUFFERING_MODE_DOUBLE, &size, &tmp); + + tv_scan_buffer_ = MEMBucket_alloc(size, GX2_SCAN_BUFFER_ALIGNMENT); + GX2Invalidate(GX2_INVALIDATE_MODE_CPU, tv_scan_buffer_, size); + GX2SetTVBuffer(tv_scan_buffer_, size, render_mode_.mode, GX2_SURFACE_FORMAT_UNORM_R8_G8_B8_A8, GX2_BUFFERING_MODE_DOUBLE); + + GX2CalcDRCSize(GX2_DRC_RENDER_MODE_SINGLE, GX2_SURFACE_FORMAT_UNORM_R8_G8_B8_A8, GX2_BUFFERING_MODE_DOUBLE, &size, &tmp); + + drc_scan_buffer_ = MEMBucket_alloc(size, GX2_SCAN_BUFFER_ALIGNMENT); + GX2Invalidate(GX2_INVALIDATE_MODE_CPU, drc_scan_buffer_, size); + GX2SetDRCBuffer(drc_scan_buffer_, size, GX2_DRC_RENDER_MODE_SINGLE, GX2_SURFACE_FORMAT_UNORM_R8_G8_B8_A8, GX2_BUFFERING_MODE_DOUBLE); + + color_buffer_.surface.dim = GX2_SURFACE_DIM_TEXTURE_2D; + color_buffer_.surface.width = render_mode_.width; + color_buffer_.surface.height = render_mode_.height; + color_buffer_.surface.depth = 1; + color_buffer_.surface.mipLevels = 1; + color_buffer_.surface.format = GX2_SURFACE_FORMAT_UNORM_R8_G8_B8_A8; + color_buffer_.surface.use = GX2_SURFACE_USE_TEXTURE_COLOR_BUFFER_TV; + color_buffer_.viewNumSlices = 1; + + GX2CalcSurfaceSizeAndAlignment(&color_buffer_.surface); + GX2InitColorBufferRegs(&color_buffer_); + + color_buffer_.surface.image = MEM1_alloc(color_buffer_.surface.imageSize, color_buffer_.surface.alignment); + GX2Invalidate(GX2_INVALIDATE_MODE_CPU, 
color_buffer_.surface.image, color_buffer_.surface.imageSize); + + depth_buffer_.surface.dim = GX2_SURFACE_DIM_TEXTURE_2D; + depth_buffer_.surface.width = render_mode_.width; + depth_buffer_.surface.height = render_mode_.height; + depth_buffer_.surface.depth = 1; + depth_buffer_.surface.mipLevels = 1; + depth_buffer_.surface.format = GX2_SURFACE_FORMAT_FLOAT_D24_S8; + depth_buffer_.surface.use = GX2_SURFACE_USE_DEPTH_BUFFER; + depth_buffer_.viewNumSlices = 1; + + GX2CalcSurfaceSizeAndAlignment(&depth_buffer_.surface); + GX2InitDepthBufferRegs(&depth_buffer_); + + depth_buffer_.surface.image = MEM1_alloc(depth_buffer_.surface.imageSize, depth_buffer_.surface.alignment); + GX2Invalidate(GX2_INVALIDATE_MODE_CPU, depth_buffer_.surface.image, depth_buffer_.surface.imageSize); + + ctx_state_ = (GX2ContextState *)MEM2_alloc(sizeof(GX2ContextState), GX2_CONTEXT_STATE_ALIGNMENT); + GX2SetupContextStateEx(ctx_state_, GX2_TRUE); + + GX2SetContextState(ctx_state_); + GX2SetShaderMode(GX2_SHADER_MODE_UNIFORM_BLOCK); + GX2SetColorBuffer(&color_buffer_, GX2_RENDER_TARGET_0); + GX2SetDepthBuffer(&depth_buffer_); + GX2SetViewport(0.0f, 0.0f, color_buffer_.surface.width, color_buffer_.surface.height, 0.0f, 1.0f); + GX2SetScissor(0, 0, color_buffer_.surface.width, color_buffer_.surface.height); + GX2SetDepthOnlyControl(GX2_DISABLE, GX2_DISABLE, GX2_COMPARE_FUNC_ALWAYS); + GX2SetColorControl(GX2_LOGIC_OP_COPY, 0xFF, GX2_DISABLE, GX2_ENABLE); + GX2SetBlendControl(GX2_RENDER_TARGET_0, GX2_BLEND_MODE_SRC_ALPHA, GX2_BLEND_MODE_INV_SRC_ALPHA, GX2_BLEND_COMBINE_MODE_ADD, GX2_ENABLE, GX2_BLEND_MODE_SRC_ALPHA, GX2_BLEND_MODE_INV_SRC_ALPHA, GX2_BLEND_COMBINE_MODE_ADD); + GX2SetCullOnlyControl(GX2_FRONT_FACE_CCW, GX2_DISABLE, GX2_DISABLE); + + GX2ClearColor(&color_buffer_, 0.0f, 0.0f, 0.0f, 1.0f); + SwapBuffers(); + + GX2SetTVEnable(GX2_ENABLE); + GX2SetDRCEnable(GX2_ENABLE); + + draw_ = Draw::T3DCreateGX2Context(ctx_state_, &color_buffer_, &depth_buffer_); + SetGPUBackend(GPUBackend::GX2); + 
GX2SetSwapInterval(0); + return draw_->CreatePresets(); +} + +void GX2GraphicsContext::Shutdown() { + if (!draw_) + return; + delete draw_; + draw_ = nullptr; + GX2ClearColor(&color_buffer_, 0.0f, 0.0f, 0.0f, 1.0f); + SwapBuffers(); + GX2DrawDone(); + GX2Shutdown(); + + GX2SetTVEnable(GX2_DISABLE); + GX2SetDRCEnable(GX2_DISABLE); + + MEM2_free(ctx_state_); + ctx_state_ = nullptr; + MEM2_free(cmd_buffer_); + cmd_buffer_ = nullptr; + MEM1_free(color_buffer_.surface.image); + color_buffer_ = {}; + MEM1_free(depth_buffer_.surface.image); + depth_buffer_ = {}; + MEMBucket_free(tv_scan_buffer_); + tv_scan_buffer_ = nullptr; + MEMBucket_free(drc_scan_buffer_); + drc_scan_buffer_ = nullptr; +} +#include "profiler/profiler.h" +void GX2GraphicsContext::SwapBuffers() { + PROFILE_THIS_SCOPE("swap"); + GX2DrawDone(); + GX2CopyColorBufferToScanBuffer(&color_buffer_, GX2_SCAN_TARGET_DRC); + GX2CopyColorBufferToScanBuffer(&color_buffer_, GX2_SCAN_TARGET_TV); + GX2SwapScanBuffers(); + GX2Flush(); +// GX2WaitForVsync(); + GX2WaitForFlip(); + GX2SetContextState(ctx_state_); + GX2SetShaderMode(GX2_SHADER_MODE_UNIFORM_BLOCK); +} diff --git a/WiiU/GX2GraphicsContext.h b/WiiU/GX2GraphicsContext.h new file mode 100644 index 000000000000..7063106897ae --- /dev/null +++ b/WiiU/GX2GraphicsContext.h @@ -0,0 +1,33 @@ +#pragma once + +#include "Common/GraphicsContext.h" +#include + +class GX2GraphicsContext : public GraphicsContext { +public: + GX2GraphicsContext() {} + + bool Init(); + + void Shutdown() override; + void SwapBuffers() override; + virtual void SwapInterval(int interval) override { GX2SetSwapInterval(interval); } + virtual void Resize() override {} + + Draw::DrawContext *GetDrawContext() override { return draw_; } + +private: + typedef struct { + int width; + int height; + GX2TVRenderMode mode; + } RenderMode; + Draw::DrawContext *draw_ = nullptr; + void *cmd_buffer_; + RenderMode render_mode_; + void *drc_scan_buffer_; + void *tv_scan_buffer_; + GX2ColorBuffer color_buffer_ = 
{}; + GX2DepthBuffer depth_buffer_ = {}; + GX2ContextState *ctx_state_; +}; diff --git a/WiiU/README.TXT b/WiiU/README.TXT new file mode 100644 index 000000000000..2fca759db99a --- /dev/null +++ b/WiiU/README.TXT @@ -0,0 +1,15 @@ + +### Build Instructions +tested with devkitPPC r38 + +make sure you have the latest ppsspp-ffmpeg submodule: +cd ffmpeg; git pull origin master + +then create a build directory, then navigate to it and run: +cmake -DWIIU=ON path/to/ppsspp +make + +if you need console output over network, set the PC_DEVELOPMENT_IP_ADDRESS define in ext/wiiu/logger.c and you can use the ext/wiiu/net_listen.sh script to listen to the log. +the run.sh script can be used to both send the compiled rpx to the HBL and listen to console output. + + diff --git a/WiiU/WiiUHost.cpp b/WiiU/WiiUHost.cpp new file mode 100644 index 000000000000..7009552a06c3 --- /dev/null +++ b/WiiU/WiiUHost.cpp @@ -0,0 +1,242 @@ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "Common/Log.h" +#include "UI/OnScreenDisplay.h" +#include "base/NativeApp.h" +#include "input/input_state.h" + +#include "WiiU/WiiUHost.h" +#include "WiiU/GX2GraphicsContext.h" + +static void VPADRangeToPSPRange(VPADVec2D *vec) { + if (vec->x == 0.0 || vec->y == 0.0) { + return; + } + float x = fabs(vec->x); + float y = fabs(vec->y); + float phi = atanf(y / x); + float scale = (x > y) ? 
1.0 / cosf(phi) : 1.0 / sinf(phi); + vec->x *= scale; + vec->y *= scale; +} + +static void VPADCallback(s32 chan) { + static keycode_t keymap[] = { + NKCODE_UNKNOWN, // VPAD_BUTTON_SYNC_BIT = 0, + NKCODE_HOME, // VPAD_BUTTON_HOME_BIT = 1, + NKCODE_BUTTON_SELECT, // VPAD_BUTTON_MINUS_BIT = 2, + NKCODE_BUTTON_START, // VPAD_BUTTON_PLUS_BIT = 3, + NKCODE_BUTTON_R1, // VPAD_BUTTON_R_BIT = 4, + NKCODE_BUTTON_L1, // VPAD_BUTTON_L_BIT = 5, + NKCODE_BUTTON_R2, // VPAD_BUTTON_ZR_BIT = 6, + NKCODE_BUTTON_L2, // VPAD_BUTTON_ZL_BIT = 7, + NKCODE_DPAD_DOWN, // VPAD_BUTTON_DOWN_BIT = 8, + NKCODE_DPAD_UP, // VPAD_BUTTON_UP_BIT = 9, + NKCODE_DPAD_RIGHT, // VPAD_BUTTON_RIGHT_BIT = 10, + NKCODE_DPAD_LEFT, // VPAD_BUTTON_LEFT_BIT = 11, + NKCODE_BUTTON_Y, // VPAD_BUTTON_Y_BIT = 12, + NKCODE_BUTTON_X, // VPAD_BUTTON_X_BIT = 13, + NKCODE_BUTTON_B, // VPAD_BUTTON_B_BIT = 14, + NKCODE_BUTTON_A, // VPAD_BUTTON_A_BIT = 15, + NKCODE_UNKNOWN, // VPAD_BUTTON_TV_BIT = 16, + NKCODE_BUTTON_THUMBR, // VPAD_BUTTON_STICK_R_BIT = 17, + NKCODE_BUTTON_THUMBL, // VPAD_BUTTON_STICK_L_BIT = 18, + NKCODE_UNKNOWN, // VPAD_BUTTON_TOUCH_BIT = 19, + NKCODE_UNKNOWN, // VPAD_BUTTON_UNUSED1_BIT = 20, + NKCODE_UNKNOWN, // VPAD_BUTTON_UNUSED2_BIT = 21, + NKCODE_UNKNOWN, // VPAD_BUTTON_UNUSED3_BIT = 22, + NKCODE_UNKNOWN, // VPAD_STICK_R_EMULATION_DOWN_BIT = 23, + NKCODE_UNKNOWN, // VPAD_STICK_R_EMULATION_UP_BIT = 24, + NKCODE_UNKNOWN, // VPAD_STICK_R_EMULATION_RIGHT_BIT = 25, + NKCODE_UNKNOWN, // VPAD_STICK_R_EMULATION_LEFT_BIT = 26, + NKCODE_UNKNOWN, // VPAD_STICK_L_EMULATION_DOWN_BIT = 27, + NKCODE_UNKNOWN, // VPAD_STICK_L_EMULATION_UP_BIT = 28, + NKCODE_UNKNOWN, // VPAD_STICK_L_EMULATION_RIGHT_BIT = 29, + NKCODE_UNKNOWN, // VPAD_STICK_L_EMULATION_LEFT_BIT = 30, + }; + + VPADStatus vpad; + VPADReadError readError; + VPADRead(chan, &vpad, 1, &readError); + + if (!readError) { + static int touchflags; + if (vpad.tpFiltered1.validity != VPAD_VALID) { + vpad.tpFiltered1.touched = false; + } + if (touchflags == 
TOUCH_DOWN || touchflags == TOUCH_MOVE) { + touchflags = vpad.tpFiltered1.touched ? TOUCH_MOVE : TOUCH_UP; + } else { + touchflags = vpad.tpFiltered1.touched ? TOUCH_DOWN : 0; + } + if (touchflags) { + VPADTouchData calibrated; + VPADGetTPCalibratedPointEx(chan, VPAD_TOUCH_RESOLUTION_854x480, &calibrated, &vpad.tpFiltered1); + NativeTouch({ (float)calibrated.x, (float)calibrated.y, 0, touchflags }); + } + for (int i = 0; i < countof(keymap); i++) { + if (keymap[i] == NKCODE_UNKNOWN) + continue; + + if ((vpad.trigger | vpad.release) & (1 << i)) { + NativeKey({ DEVICE_ID_PAD_0 + chan, keymap[i], (vpad.trigger & (1 << i)) ? KEY_DOWN : KEY_UP }); + } + } + + static VPADVec2D prevLeftStick, prevRightStick; + if (prevLeftStick.x != vpad.leftStick.x || prevLeftStick.y != vpad.leftStick.y) { + prevLeftStick = vpad.leftStick; + VPADRangeToPSPRange(&vpad.leftStick); + NativeAxis({ DEVICE_ID_PAD_0 + chan, JOYSTICK_OUYA_AXIS_LS_X, vpad.leftStick.x }); + NativeAxis({ DEVICE_ID_PAD_0 + chan, JOYSTICK_OUYA_AXIS_LS_Y, vpad.leftStick.y }); + } + if (prevRightStick.x != vpad.rightStick.x || prevRightStick.y != vpad.rightStick.y) { + prevRightStick = vpad.rightStick; + VPADRangeToPSPRange(&vpad.rightStick); + NativeAxis({ DEVICE_ID_PAD_0 + chan, JOYSTICK_OUYA_AXIS_RS_X, vpad.rightStick.x }); + NativeAxis({ DEVICE_ID_PAD_0 + chan, JOYSTICK_OUYA_AXIS_RS_Y, vpad.rightStick.y }); + } +#if 1 + if (vpad.trigger & VPAD_BUTTON_ZL) { + System_SendMessage("finish", ""); + } + if (vpad.trigger & VPAD_BUTTON_STICK_L) { + extern bool g_TakeScreenshot; + g_TakeScreenshot = true; + } +#endif + } +} + +static void SaveCallback(void) { OSSavesDone_ReadyToRelease(); } + +WiiUHost::WiiUHost() { + ProcUIInit(&SaveCallback); + VPADInit(); + WPADEnableURCC(true); + WPADEnableWiiRemote(true); + KPADInit(); + VPADSetSamplingCallback(0, VPADCallback); + + chdir("sd:/ppsspp/"); // probably useless... 
+} + +WiiUHost::~WiiUHost() { + VPADSetSamplingCallback(0, nullptr); + ProcUIShutdown(); +} + +bool WiiUHost::InitGraphics(std::string *error_message, GraphicsContext **ctx) { + if (ctx_.Init()) { + *ctx = &ctx_; + return true; + } else { + *ctx = nullptr; + return false; + } +} + +void WiiUHost::ShutdownGraphics() { ctx_.Shutdown(); } + +#define AX_FRAMES 2 +static_assert(!(AX_FRAMES & (AX_FRAMES - 1)), "AX_FRAMES must be a power of two"); + +static AXMVoice *mvoice; +static s16 __attribute__((aligned(64))) axBuffers[2][AX_FRAMES][AX_FRAME_SIZE]; + +static void AXCallback() { + static s16 mixBuffer[AX_FRAME_SIZE * 2]; + static int pos; +#if 1 + AXVoiceOffsets offsets; + AXGetVoiceOffsets(mvoice->v[0], &offsets); + if ((offsets.currentOffset / AX_FRAME_SIZE) == pos) { + pos = ((offsets.currentOffset / AX_FRAME_SIZE) + (AX_FRAMES >> 1)) & (AX_FRAMES - 1); + } +#endif + + int count = NativeMix(mixBuffer, AX_FRAME_SIZE); + int extra = AX_FRAME_SIZE - count; + + const s16 *src = mixBuffer; + s16 *dst_l = axBuffers[0][pos]; + s16 *dst_r = axBuffers[1][pos]; + + while (count--) { + *dst_l++ = *src++; + *dst_r++ = *src++; + } + while (extra--) { + *dst_l++ = 0; + *dst_r++ = 0; + } + + DCStoreRangeNoSync(axBuffers[0][pos], AX_FRAME_SIZE * sizeof(s16)); + DCStoreRangeNoSync(axBuffers[1][pos], AX_FRAME_SIZE * sizeof(s16)); + + pos++; + pos &= AX_FRAMES - 1; +} + +void WiiUHost::InitSound() { + if (InitSoundRefCount_++) + return; + + AXInitParams initParams = { AX_INIT_RENDERER_48KHZ }; + AXInitWithParams(&initParams); + + AXMVoiceParams mVoiceParams = {}; + mVoiceParams.count = 2; + AXAcquireMultiVoice(31, NULL, 0, &mVoiceParams, &mvoice); + + if (mvoice && mvoice->channels == 2) { + AXVoiceOffsets offsets[2]; + offsets[0].currentOffset = AX_FRAME_SIZE; + offsets[0].loopOffset = 0; + offsets[0].endOffset = (AX_FRAMES * AX_FRAME_SIZE) - 1; + offsets[0].loopingEnabled = AX_VOICE_LOOP_ENABLED; + offsets[0].dataType = AX_VOICE_FORMAT_LPCM16; + offsets[0].data = axBuffers[0]; + + 
offsets[1] = offsets[0]; + offsets[1].data = axBuffers[1]; + AXSetMultiVoiceOffsets(mvoice, offsets); + + AXSetMultiVoiceSrcType(mvoice, AX_VOICE_SRC_TYPE_NONE); + AXSetMultiVoiceSrcRatio(mvoice, 1.0f); + AXVoiceVeData ve = { 0x8000, 0 }; + AXSetMultiVoiceVe(mvoice, &ve); + + AXSetMultiVoiceDeviceMix(mvoice, AX_DEVICE_TYPE_DRC, 0, 0, 0x8000, 0); + AXSetMultiVoiceDeviceMix(mvoice, AX_DEVICE_TYPE_TV, 0, 0, 0x8000, 0); + } + + AXRegisterFrameCallback(AXCallback); + AXSetMultiVoiceState(mvoice, AX_VOICE_STATE_PLAYING); +} + +void WiiUHost::ShutdownSound() { + if (--InitSoundRefCount_) + return; + + AXSetMultiVoiceState(mvoice, AX_VOICE_STATE_STOPPED); + AXRegisterFrameCallback(NULL); + + AXFreeMultiVoice(mvoice); + AXQuit(); +} + +void WiiUHost::NotifyUserMessage(const std::string &message, float duration, u32 color, const char *id) { osm.Show(message, duration, color, -1, true, id); } + +void WiiUHost::SendUIMessage(const std::string &message, const std::string &value) { NativeMessageReceived(message.c_str(), value.c_str()); } diff --git a/WiiU/WiiUHost.h b/WiiU/WiiUHost.h new file mode 100644 index 000000000000..33c1bcfb565a --- /dev/null +++ b/WiiU/WiiUHost.h @@ -0,0 +1,24 @@ +#pragma once + +#include "../Core/Host.h" +#include "WiiU/GX2GraphicsContext.h" + +class GraphicsContext; + +class WiiUHost : public Host { +public: + WiiUHost(); + ~WiiUHost(); + + bool InitGraphics(std::string *error_message, GraphicsContext **ctx) override; + void ShutdownGraphics() override; + + void InitSound() override; + void ShutdownSound() override; + + void NotifyUserMessage(const std::string &message, float duration = 1.0f, u32 color = 0x00FFFFFF, const char *id = nullptr) override; + void SendUIMessage(const std::string &message, const std::string &value) override; +private: + GX2GraphicsContext ctx_; + int InitSoundRefCount_ = 0; +}; diff --git a/WiiU/WiiUMain.cpp b/WiiU/WiiUMain.cpp new file mode 100644 index 000000000000..2dba30516f2b --- /dev/null +++ b/WiiU/WiiUMain.cpp @@ 
-0,0 +1,164 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include "profiler/profiler.h" +#include "base/NativeApp.h" +#include "base/display.h" +#include "Core/Core.h" +#include "Common/Log.h" + +#include "Common/GraphicsContext.h" +#include "WiiU/WiiUHost.h" + +const char *PROGRAM_NAME = "PPSSPP"; +const char *PROGRAM_VERSION = PPSSPP_GIT_VERSION; + +static int g_QuitRequested; +void System_SendMessage(const char *command, const char *parameter) { + if (!strcmp(command, "finish")) { + g_QuitRequested = true; + UpdateUIState(UISTATE_EXIT); + Core_Stop(); + } +} + +int main(int argc, char **argv) { + PROFILE_INIT(); + PPCSetFpIEEEMode(); + + host = new WiiUHost(); + + std::string app_name; + std::string app_name_nice; + std::string version; + bool landscape; + NativeGetAppInfo(&app_name, &app_name_nice, &landscape, &version); + + const char *argv_[] = { + "sd:/ppsspp/PPSSPP.rpx", +// "-d", +// "-v", +// "sd:/cube.elf", + nullptr + }; + NativeInit(sizeof(argv_) / sizeof(*argv_) - 1, argv_, "sd:/ppsspp/", "sd:/ppsspp/", nullptr); +#if 0 + UpdateScreenScale(854,480); +#else + float dpi_scale = 1.0f; + g_dpi = 96.0f; + pixel_xres = 854; + pixel_yres = 480; + dp_xres = (float)pixel_xres * dpi_scale; + dp_yres = (float)pixel_yres * dpi_scale; + pixel_in_dps_x = (float)pixel_xres / dp_xres; + pixel_in_dps_y = (float)pixel_yres / dp_yres; + g_dpi_scale_x = dp_xres / (float)pixel_xres; + g_dpi_scale_y = dp_yres / (float)pixel_yres; + g_dpi_scale_real_x = g_dpi_scale_x; + g_dpi_scale_real_y = g_dpi_scale_y; +#endif + printf("Pixels: %i x %i\n", pixel_xres, pixel_yres); + printf("Virtual pixels: %i x %i\n", dp_xres, dp_yres); + + g_Config.iGPUBackend = (int)GPUBackend::GX2; + g_Config.bEnableSound = true; + g_Config.bPauseExitsEmulator = false; + g_Config.bPauseMenuExitsEmulator = false; + g_Config.iCpuCore = (int)CPUCore::JIT; + g_Config.bVertexDecoderJit = false; + g_Config.bSoftwareRendering = false; 
+// g_Config.iFpsLimit = 0; + g_Config.bHardwareTransform = true; + g_Config.bSoftwareSkinning = false; + g_Config.bVertexCache = true; +// g_Config.bTextureBackoffCache = true; + std::string error_string; + GraphicsContext *ctx; + host->InitGraphics(&error_string, &ctx); + NativeInitGraphics(ctx); + NativeResized(); + + host->InitSound(); + while (true) { + if (g_QuitRequested) + break; + + if (!Core_IsActive()) + UpdateUIState(UISTATE_MENU); + Core_Run(ctx); + } + host->ShutdownSound(); + + NativeShutdownGraphics(); + NativeShutdown(); + + return 0; +} + +std::string System_GetProperty(SystemProperty prop) { + switch (prop) { + case SYSPROP_NAME: + return "Wii-U"; + case SYSPROP_LANGREGION: + return "en_US"; + default: + return ""; + } +} + +int System_GetPropertyInt(SystemProperty prop) { + switch (prop) { + case SYSPROP_DISPLAY_REFRESH_RATE: + return 60000; // internal refresh rate is always 59.940, even for PAL output. + case SYSPROP_DISPLAY_XRES: + return 854; + case SYSPROP_DISPLAY_YRES: + return 480; + case SYSPROP_DEVICE_TYPE: + return DEVICE_TYPE_TV; + case SYSPROP_AUDIO_SAMPLE_RATE: + case SYSPROP_AUDIO_OPTIMAL_SAMPLE_RATE: + return 48000; + case SYSPROP_AUDIO_FRAMES_PER_BUFFER: + case SYSPROP_AUDIO_OPTIMAL_FRAMES_PER_BUFFER: + return AX_FRAME_SIZE; + case SYSPROP_SYSTEMVERSION: + default: + return -1; + } +} +bool System_GetPropertyBool(SystemProperty prop) { + switch (prop) { + case SYSPROP_APP_GOLD: +#ifdef GOLD + return true; +#else + return false; +#endif + default: + return false; + } +} + +float System_GetPropertyFloat(SystemProperty prop) { + return -1; +} + +void System_AskForPermission(SystemPermission permission) {} +PermissionStatus System_GetPermissionStatus(SystemPermission permission) { return PERMISSION_STATUS_GRANTED; } + +void LaunchBrowser(const char *url) {} +void ShowKeyboard() {} +void Vibrate(int length_ms) {} diff --git a/cmake/Toolchains/wiiu.cmake b/cmake/Toolchains/wiiu.cmake new file mode 100644 index 
000000000000..5cd2bd4c2aec
--- /dev/null
+++ b/cmake/Toolchains/wiiu.cmake
@@ -0,0 +1,126 @@
+# Toolchain file for cross-compiling PPSSPP to the Wii U with devkitPPC.
+# The top-level CMakeLists.txt selects it when WIIU is set. Besides forcing the
+# compilers it adds the Wii U compile/link flags, declares the support
+# libraries built from WIIU_ROOT and provides add_rpx_target().
+
+set(CMAKE_SYSTEM_NAME Generic)
+set(CMAKE_SYSTEM_PROCESSOR powerpc)
+set(WIIU ON)
+set(CMAKE_CROSSCOMPILING ON)
+
+# add_link_options() (used below) was introduced in CMake 3.13, but the project
+# only requires 3.6. Stop with a clear message instead of "unknown command".
+if(CMAKE_VERSION VERSION_LESS "3.13")
+  message(FATAL_ERROR "The Wii U toolchain needs CMake 3.13 or newer (it uses add_link_options).")
+endif()
+
+if(NOT DEFINED ENV{DEVKITPPC})
+  message(FATAL_ERROR "Please set DEVKITPPC in your environment. export DEVKITPPC=/path/to/devkitPPC")
+endif()
+
+if(NOT DEFINED ENV{DEVKITPRO})
+  message(FATAL_ERROR "Please set DEVKITPRO in your environment. export DEVKITPRO=/path/to/devkitPRO")
+endif()
+
+# CMakeForceCompiler is deprecated, so its warning is muted just for these calls.
+set(CMAKE_WARN_DEPRECATED OFF)
+include(CMakeForceCompiler)
+CMAKE_FORCE_C_COMPILER("$ENV{DEVKITPPC}/bin/powerpc-eabi-gcc" GNU)
+CMAKE_FORCE_CXX_COMPILER("$ENV{DEVKITPPC}/bin/powerpc-eabi-g++" GNU)
+set(CMAKE_WARN_DEPRECATED ON)
+
+set(CMAKE_FIND_ROOT_PATH "$ENV{DEVKITPPC}")
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+# Root of the Wii U support sources; defaults to ext/wiiu in the source tree.
+# NOTE(review): CMAKE_SOURCE_DIR is presumably different inside try_compile()
+# projects that also load this file - confirm whether that matters here.
+if(NOT DEFINED WIIU_ROOT)
+  set(WIIU_ROOT "${CMAKE_SOURCE_DIR}/ext/wiiu")
+endif()
+
+include_directories("${WIIU_ROOT}/include")
+link_directories("${WIIU_ROOT}")
+
+# Warning switches are compiler options, not preprocessor definitions.
+add_compile_options(-Werror=format -Wno-format-truncation -Wno-format-overflow)
+add_definitions(-D__wiiu__ -D__powerpc__ -DFD_SETSIZE=32)
+
+add_compile_options(-mcpu=750 -meabi -mhard-float)
+add_compile_options(-msdata=eabi)
+add_compile_options(-ffunction-sections -fdata-sections)
+add_compile_options(-ftls-model=global-dynamic)
+
+add_link_options(-Wl,-z,common-page-size=64,-z,max-page-size=64,-z,nocopyreloc)
+add_link_options(-Wl,--emit-relocs,--no-tls-optimize,--gc-sections)
+add_link_options(-nostartfiles -T "${WIIU_ROOT}/link.ld")
+
+# NOTE(review): ARCH_FLAGS is not set anywhere in this file; it presumably
+# expands to nothing unless the caller defines it - confirm.
+set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${ARCH_FLAGS} -mregnames -Wa,--sectname-subst")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
+
+# Support libraries built from the sources under WIIU_ROOT. The
+# WIIU_LIBRARIES_ADDED guard skips the declarations once the flag has been set.
+if(NOT WIIU_LIBRARIES_ADDED)
+  add_library(wiiu STATIC
+    "${WIIU_ROOT}/missing_libc_functions.c"
+    "${WIIU_ROOT}/exception_handler.c"
+    "${WIIU_ROOT}/entry.c"
+    "${WIIU_ROOT}/logger.c"
+    "${WIIU_ROOT}/memory.c"
+    "${WIIU_ROOT}/cxx_utils.cpp"
+    "${WIIU_ROOT}/fs_utils.c"
+    "${WIIU_ROOT}/sd_fat_devoptab.c"
+    "${WIIU_ROOT}/gx2_validation.c"
+    "${WIIU_ROOT}/shader_info.c"
+    "${WIIU_ROOT}/stubs.S"
+    "${WIIU_ROOT}/gthr-wup.cpp"
+    "${WIIU_ROOT}/shader_disasm.cpp"
+  )
+  add_library(pthread STATIC "${WIIU_ROOT}/pthread.c")
+  add_library(fat STATIC
+    "${WIIU_ROOT}/libfat/bit_ops.h"
+    "${WIIU_ROOT}/libfat/cache.c"
+    "${WIIU_ROOT}/libfat/cache.h"
+    "${WIIU_ROOT}/libfat/common.h"
+    "${WIIU_ROOT}/libfat/directory.c"
+    "${WIIU_ROOT}/libfat/directory.h"
+    "${WIIU_ROOT}/libfat/disc.c"
+    "${WIIU_ROOT}/libfat/disc.h"
+    "${WIIU_ROOT}/libfat/fatdir.c"
+    "${WIIU_ROOT}/libfat/fatdir.h"
+    "${WIIU_ROOT}/libfat/fatfile.c"
+    "${WIIU_ROOT}/libfat/fatfile.h"
+    "${WIIU_ROOT}/libfat/file_allocation_table.c"
+    "${WIIU_ROOT}/libfat/file_allocation_table.h"
+    "${WIIU_ROOT}/libfat/filetime.c"
+    "${WIIU_ROOT}/libfat/filetime.h"
+    "${WIIU_ROOT}/libfat/libfat.c"
+    "${WIIU_ROOT}/libfat/lock.c"
+    "${WIIU_ROOT}/libfat/lock.h"
+    "${WIIU_ROOT}/libfat/mem_allocate.h"
+    "${WIIU_ROOT}/libfat/partition.c"
+    "${WIIU_ROOT}/libfat/partition.h"
+  )
+  add_library(iosuhax STATIC
+    "${WIIU_ROOT}/libiosuhax/iosuhax.c"
+    "${WIIU_ROOT}/libiosuhax/iosuhax_devoptab.c"
+    "${WIIU_ROOT}/libiosuhax/iosuhax_disc_interface.c"
+  )
+  set(WIIU_LIBRARIES_ADDED TRUE)
+endif()
+
+set(RPLTOOL "${WIIU_ROOT}/rpltool/rpltool")
+
+# add_rpx_target(<target>)
+# Adds a post-build step to <target> that invokes rpltool on the built binary
+# with `-S -o <target>.rpx`, exactly as the previous macro did.
+function(add_rpx_target target)
+  add_custom_command(TARGET ${target} POST_BUILD
+    COMMAND "${RPLTOOL}" "$<TARGET_FILE:${target}>" -S -o "${target}.rpx"
+    COMMENT "Running rpltool on ${target}"
+    VERBATIM)
+endfunction()
diff --git a/ext/CMakeLists.txt b/ext/CMakeLists.txt index 16b4adbd7ebd..596c86d30943 100644 --- a/ext/CMakeLists.txt +++ b/ext/CMakeLists.txt @@ -1,22 +1,24 @@ set(ARMIPS_REGEXP OFF CACHE BOOL "" FORCE) - add_subdirectory(armips) -if(NOT USING_GLES2) - add_subdirectory(glew) -endif() -set(ENABLE_GLSLANG_BINARIES OFF CACHE BOOL "let's not build binaries we don't
need" FORCE) -set(SPIRV_CROSS_EXCEPTIONS_TO_ASSERTIONS ON CACHE BOOL "let's not use exceptions" FORCE) +if(NOT WIIU) + if(NOT USING_GLES2) + add_subdirectory(glew) + endif() + + set(ENABLE_GLSLANG_BINARIES OFF CACHE BOOL "let's not build binaries we don't need" FORCE) + set(SPIRV_CROSS_EXCEPTIONS_TO_ASSERTIONS ON CACHE BOOL "let's not use exceptions" FORCE) -# This is really a workaround for an NDK 20 compiler issue (PPSSPP issue #12105), but shouldn't hurt. -if(ANDROID) -set(ENABLE_HLSL OFF CACHE BOOL "let's not build HLSL support we don't need" FORCE) + # This is really a workaround for an NDK 20 compiler issue (PPSSPP issue #12105), but shouldn't hurt. + if(ANDROID) + set(ENABLE_HLSL OFF CACHE BOOL "let's not build HLSL support we don't need" FORCE) + endif() + add_subdirectory(glslang EXCLUDE_FROM_ALL) + add_subdirectory(SPIRV-Cross-build) endif() -add_subdirectory(glslang EXCLUDE_FROM_ALL) add_subdirectory(snappy) add_subdirectory(udis86) -add_subdirectory(SPIRV-Cross-build) -if(USE_DISCORD AND NOT IOS AND NOT LIBRETRO) +if(USE_DISCORD AND NOT IOS AND NOT WIIU AND NOT LIBRETRO) add_subdirectory(discord-rpc-build) endif() diff --git a/ext/libkirk/amctrl.c b/ext/libkirk/amctrl.c index d144ca792c3c..2c6d20a2a69e 100644 --- a/ext/libkirk/amctrl.c +++ b/ext/libkirk/amctrl.c @@ -33,13 +33,14 @@ static u8 kirk_buf[0x0814]; // 1DC0 1DD4 static int do_kirk4(u8 *buf, int size, int type) { int retv; - u32 *header = (u32*)buf; + KIRK_AES128CBC_HEADER *header = (KIRK_AES128CBC_HEADER*)buf; - header[0] = 4; - header[1] = 0; - header[2] = 0; - header[3] = type; - header[4] = size; + + header->mode = 4; + header->unk_4 = 0; + header->unk_8 = 0; + header->keyseed = type; + header->data_size = size; retv = kirk_sceUtilsBufferCopyWithRange(buf, size+0x14, buf, size, 4); @@ -52,13 +53,13 @@ static int do_kirk4(u8 *buf, int size, int type) static int do_kirk7(u8 *buf, int size, int type) { int retv; - u32 *header = (u32*)buf; + KIRK_AES128CBC_HEADER *header = 
(KIRK_AES128CBC_HEADER*)buf; - header[0] = 5; - header[1] = 0; - header[2] = 0; - header[3] = type; - header[4] = size; + header->mode = 5; + header->unk_4 = 0; + header->unk_8 = 0; + header->keyseed = type; + header->data_size = size; retv = kirk_sceUtilsBufferCopyWithRange(buf, size+0x14, buf, size, 7); if(retv) @@ -70,13 +71,13 @@ static int do_kirk7(u8 *buf, int size, int type) static int kirk5(u8 *buf, int size) { int retv; - u32 *header = (u32*)buf; + KIRK_AES128CBC_HEADER *header = (KIRK_AES128CBC_HEADER*)buf; - header[0] = 4; - header[1] = 0; - header[2] = 0; - header[3] = 0x0100; - header[4] = size; + header->mode = 4; + header->unk_4 = 0; + header->unk_8 = 0; + header->keyseed = 0x0100; + header->data_size = size; retv = kirk_sceUtilsBufferCopyWithRange(buf, size+0x14, buf, size, 5); if(retv) @@ -88,13 +89,13 @@ static int kirk5(u8 *buf, int size) static int kirk8(u8 *buf, int size) { int retv; - u32 *header = (u32*)buf; + KIRK_AES128CBC_HEADER *header = (KIRK_AES128CBC_HEADER*)buf; - header[0] = 5; - header[1] = 0; - header[2] = 0; - header[3] = 0x0100; - header[4] = size; + header->mode = 5; + header->unk_4 = 0; + header->unk_8 = 0; + header->keyseed = 0x0100; + header->data_size = size; retv = kirk_sceUtilsBufferCopyWithRange(buf, size+0x14, buf, size, 8); if(retv) @@ -428,12 +429,12 @@ static int sub_428(u8 *kbuf, u8 *dbuf, int size, CIPHER_KEY *ckey) memset(tmp1, 0, 0x10); }else{ memcpy(tmp1, tmp2, 0x10); - *(u32*)(tmp1+0x0c) = ckey->seed-1; + ((KIRK_AES128CBC_BLOCK*)tmp1)->seed = ckey->seed-1; } for(i=0; iseed; + ((KIRK_AES128CBC_BLOCK*)(kbuf+0x14+i))->seed = ckey->seed; ckey->seed += 1; } @@ -600,8 +601,35 @@ int sceNpDrmGetFixedKey(u8 *key, char *npstr, int type) static const u8 dnas_key1A90[] = {0xED,0xE2,0x5D,0x2D,0xBB,0xF8,0x12,0xE5,0x3C,0x5C,0x59,0x32,0xFA,0xE3,0xE2,0x43}; static const u8 dnas_key1AA0[] = {0x27,0x74,0xFB,0xEB,0xA4,0xA0, 1,0xD7, 2,0x56,0x9E,0x33,0x8C,0x19,0x57,0x83}; +#ifdef __GNUC__ +#pragma scalar_storage_order little-endian 
+#endif +typedef struct { + u8 magic[0x4]; // 0x0 + u32 key_index; // 0x4 + u32 drm_type; // 0x8 + u32 unk0C; // 0xC + u8 header_key[0x10]; // 0x10 + u8 unk20[0x10]; // 0x20 + struct { + u8 dkey[0x10]; // 0x30 + u32 unk40; // 0x40 + u32 data_size; // 0x44 + u32 block_size; // 0x48 + u32 data_offset; // 0x4C + u8 unk50[0x10]; // 0x50 + } desc; + u8 unk60[0x10]; // 0x60 + u8 mac_70[0x10]; // 0x70 + u8 mac_80[0x10]; // 0x80 +} PGD_HEADER; +#ifdef __GNUC__ +#pragma scalar_storage_order default +#endif + PGD_DESC *pgd_open(u8 *pgd_buf, int pgd_flag, u8 *pgd_vkey) { + PGD_HEADER *pgd_header = (PGD_HEADER *)pgd_buf; PGD_DESC *pgd; MAC_KEY mkey; CIPHER_KEY ckey; @@ -613,8 +641,8 @@ PGD_DESC *pgd_open(u8 *pgd_buf, int pgd_flag, u8 *pgd_vkey) pgd = (PGD_DESC*)malloc(sizeof(PGD_DESC)); memset(pgd, 0, sizeof(PGD_DESC)); - pgd->key_index = *(u32*)(pgd_buf+4); - pgd->drm_type = *(u32*)(pgd_buf+8); + pgd->key_index = pgd_header->key_index; + pgd->drm_type = pgd_header->drm_type; if(pgd->drm_type==1){ pgd->mac_type = 1; @@ -644,8 +672,8 @@ PGD_DESC *pgd_open(u8 *pgd_buf, int pgd_flag, u8 *pgd_vkey) // MAC_0x80 check sceDrmBBMacInit(&mkey, pgd->mac_type); - sceDrmBBMacUpdate(&mkey, pgd_buf+0x00, 0x80); - retv = sceDrmBBMacFinal2(&mkey, pgd_buf+0x80, fkey); + sceDrmBBMacUpdate(&mkey, (u8*)pgd_header, 0x80); + retv = sceDrmBBMacFinal2(&mkey, pgd_header->mac_80, fkey); if(retv){ //ERROR_LOG(HLE, "pgd_open: MAC_80 check failed!: %08x(%d)\n", retv, retv); free(pgd); @@ -654,10 +682,10 @@ PGD_DESC *pgd_open(u8 *pgd_buf, int pgd_flag, u8 *pgd_vkey) // MAC_0x70 sceDrmBBMacInit(&mkey, pgd->mac_type); - sceDrmBBMacUpdate(&mkey, pgd_buf+0x00, 0x70); + sceDrmBBMacUpdate(&mkey, (u8*)pgd_header, 0x70); if(pgd_vkey){ // use given vkey - retv = sceDrmBBMacFinal2(&mkey, pgd_buf+0x70, pgd_vkey); + retv = sceDrmBBMacFinal2(&mkey, pgd_header->mac_70, pgd_vkey); if(retv){ //ERROR_LOG(HLE, "pgd_open: MAC_70 check failed!: %08x(%d)\n", retv, retv); free(pgd); @@ -667,18 +695,18 @@ PGD_DESC *pgd_open(u8 
*pgd_buf, int pgd_flag, u8 *pgd_vkey) } }else{ // get vkey from MAC_70 - bbmac_getkey(&mkey, pgd_buf+0x70, pgd->vkey); + bbmac_getkey(&mkey, pgd_header->mac_70, pgd->vkey); } // decrypt PGD_DESC - sceDrmBBCipherInit(&ckey, pgd->cipher_type, 2, pgd_buf+0x10, pgd->vkey, 0); - sceDrmBBCipherUpdate(&ckey, pgd_buf+0x30, 0x30); + sceDrmBBCipherInit(&ckey, pgd->cipher_type, 2, pgd_header->header_key, pgd->vkey, 0); + sceDrmBBCipherUpdate(&ckey, (u8 *)&pgd_header->desc, sizeof(pgd_header->desc)); sceDrmBBCipherFinal(&ckey); - pgd->data_size = *(u32*)(pgd_buf+0x44); - pgd->block_size = *(u32*)(pgd_buf+0x48); - pgd->data_offset = *(u32*)(pgd_buf+0x4c); - memcpy(pgd->dkey, pgd_buf+0x30, 16); + pgd->data_size = pgd_header->desc.data_size; + pgd->block_size = pgd_header->desc.block_size; + pgd->data_offset = pgd_header->desc.data_offset; + memcpy(pgd->dkey, pgd_header->desc.dkey, 16); pgd->align_size = (pgd->data_size+15)&~15; pgd->table_offset = pgd->data_offset+pgd->align_size; diff --git a/ext/libkirk/kirk_engine.h b/ext/libkirk/kirk_engine.h index 5c7be012b709..86165816ece0 100644 --- a/ext/libkirk/kirk_engine.h +++ b/ext/libkirk/kirk_engine.h @@ -49,6 +49,18 @@ typedef unsigned int u32; #define KIRK_INVALID_SIZE 0xF #define KIRK_DATA_SIZE_ZERO 0x10 +#ifdef __GNUC__ +#pragma scalar_storage_order little-endian +#endif + +typedef struct +{ + int unk_0; //0 + int unk_4; //4 + int unk_8; //8 + int seed; //C +}KIRK_AES128CBC_BLOCK; + typedef struct { int mode; //0 @@ -133,6 +145,10 @@ typedef struct ECDSA_SIG signature; //3C } KIRK_CMD17_BUFFER;//0x64 +#ifdef __GNUC__ +#pragma scalar_storage_order default +#endif + //mode passed to sceUtilsBufferCopyWithRange #define KIRK_CMD_DECRYPT_PRIVATE 1 #define KIRK_CMD_2 2 diff --git a/ext/native/file/chunk_file.cpp b/ext/native/file/chunk_file.cpp index 20553a8657bd..852289252b14 100644 --- a/ext/native/file/chunk_file.cpp +++ b/ext/native/file/chunk_file.cpp @@ -3,6 +3,7 @@ #include "file/file_util.h" #include "Common/Log.h" +#include 
"Common/Swap.h" inline uint32_t flipID(uint32_t id) { return ((id >> 24) & 0xFF) | ((id >> 8) & 0xFF00) | ((id << 8) & 0xFF0000) | ((id << 24) & 0xFF000000); @@ -24,7 +25,7 @@ RIFFReader::~RIFFReader() { int RIFFReader::ReadInt() { if (data_ && pos_ < eof_ - 3) { pos_ += 4; - return *(int *)(data_ + pos_ - 4); + return *(s32_le *)(data_ + pos_ - 4); } return 0; } diff --git a/ext/native/file/fd_util.cpp b/ext/native/file/fd_util.cpp index 1b47c215fcb6..efb2cf2f462f 100644 --- a/ext/native/file/fd_util.cpp +++ b/ext/native/file/fd_util.cpp @@ -109,7 +109,12 @@ bool WaitUntilReady(int fd, double timeout, bool for_write) { } void SetNonBlocking(int sock, bool non_blocking) { -#ifndef _WIN32 +#if defined(__wiiu__) + u32 val = non_blocking ? 1 : 0; + if(setsockopt(sock, SOL_SOCKET, SO_NONBLOCK, &val, sizeof(val)) != 0) { + ERROR_LOG(IO, "Error setting socket nonblocking status"); + } +#elif !defined(_WIN32) int opts = fcntl(sock, F_GETFL); if (opts < 0) { perror("fcntl(F_GETFL)"); diff --git a/ext/native/file/path.cpp b/ext/native/file/path.cpp index 8025878c0225..4e3e14933c2d 100644 --- a/ext/native/file/path.cpp +++ b/ext/native/file/path.cpp @@ -207,10 +207,14 @@ bool PathBrowser::GetListing(std::vector &fileInfo, const char *filter guard.lock(); } -#ifdef _WIN32 +#if defined(_WIN32) || defined(__wiiu__) if (path_ == "/") { // Special path that means root of file system. +#ifdef _WIN32 std::vector drives = getWindowsDrives(); +#else + std::vector drives = {"sd:/", "usb:/"}; +#endif for (auto drive = drives.begin(); drive != drives.end(); ++drive) { if (*drive == "A:/" || *drive == "B:/") continue; @@ -223,6 +227,9 @@ bool PathBrowser::GetListing(std::vector &fileInfo, const char *filter fake.isWritable = false; fileInfo.push_back(fake); } +#ifdef __wiiu__ + return true; +#endif } #endif @@ -244,6 +251,10 @@ void PathBrowser::Navigate(const std::string &path) { // Check for windows drives. 
if (path_.size() == 3 && path_[1] == ':') { path_ = "/"; +#ifdef __wiiu__ + } else if (path_ == "sd:/" || path_ == "usb:/") { + path_ = "/"; +#endif } else { size_t slash = path_.rfind('/', path_.size() - 2); if (slash != std::string::npos) @@ -252,6 +263,10 @@ void PathBrowser::Navigate(const std::string &path) { } else { if (path.size() > 2 && path[1] == ':' && path_ == "/") path_ = path; +#ifdef __wiiu__ + else if (path == "sd:/" || path == "usb:/") + path_ = path; +#endif else path_ = path_ + path; if (path_[path_.size() - 1] != '/') diff --git a/ext/native/file/zip_read.cpp b/ext/native/file/zip_read.cpp index 7efcab72557b..b9252d48250a 100644 --- a/ext/native/file/zip_read.cpp +++ b/ext/native/file/zip_read.cpp @@ -279,11 +279,12 @@ void VFSShutdown() { static bool IsLocalPath(const char *path) { bool isUnixLocal = path[0] == '/'; #ifdef _WIN32 - bool isWindowsLocal = isalpha(path[0]) && path[1] == ':'; + return isUnixLocal || (isalpha(path[0]) && path[1] == ':'); +#elif defined(__wiiu__) + return isUnixLocal || !strncmp(path, "sd:/", 4) || !strncmp(path, "usb:/", 5); #else - bool isWindowsLocal = false; + return isUnixLocal; #endif - return isUnixLocal || isWindowsLocal; } // The returned data should be free'd with delete[]. 
diff --git a/ext/native/gfx/texture_atlas.h b/ext/native/gfx/texture_atlas.h index 58a2b86c96e4..60847ecdfcfc 100644 --- a/ext/native/gfx/texture_atlas.h +++ b/ext/native/gfx/texture_atlas.h @@ -2,6 +2,7 @@ #include #include +#include "Common/Swap.h" #define ATLAS_MAGIC ('A' + ('T' << 8) + ('L' << 16) | ('A' << 24)) @@ -71,28 +72,28 @@ struct FontID { struct AtlasChar { // texcoords - float sx, sy, ex, ey; + float_le sx, sy, ex, ey; // offset from the origin - float ox, oy; + float_le ox, oy; // distance to move the origin forward - float wx; + float_le wx; // size in pixels - unsigned short pw, ph; + u16_le pw, ph; }; struct AtlasCharRange { - int start; - int end; - int result_index; + s32_le start; + s32_le end; + s32_le result_index; }; struct AtlasFontHeader { - float padding; - float height; - float ascend; - float distslope; - int numRanges; - int numChars; + float_le padding; + float_le height; + float_le ascend; + float_le distslope; + s32_le numRanges; + s32_le numChars; char name[32]; }; @@ -114,16 +115,16 @@ struct AtlasFont { }; struct AtlasImage { - float u1, v1, u2, v2; - int w, h; + float_le u1, v1, u2, v2; + s32_le w, h; char name[32]; }; struct AtlasHeader { - int magic; - int version; - int numFonts; - int numImages; + s32_le magic; + s32_le version; + s32_le numFonts; + s32_le numImages; }; struct Atlas { diff --git a/ext/native/image/zim_load.cpp b/ext/native/image/zim_load.cpp index 86d2627f0ce4..5715e2668947 100644 --- a/ext/native/image/zim_load.cpp +++ b/ext/native/image/zim_load.cpp @@ -53,9 +53,9 @@ int LoadZIMPtr(const uint8_t *zim, size_t datasize, int *width, int *height, int ERROR_LOG(IO, "Not a ZIM file"); return 0; } - memcpy(width, zim + 4, 4); - memcpy(height, zim + 8, 4); - memcpy(flags, zim + 12, 4); + *width = zim[4] << 0 | zim[5] << 8 | zim[6] << 16 | zim[7] << 24; + *height = zim[8] << 0 | zim[9] << 8 | zim[10] << 16 | zim[11] << 24; + *flags = zim[12] << 0 | zim[13] << 8 | zim[14] << 16 | zim[15] << 24; int num_levels = 1; 
int image_data_size[ZIM_MAX_MIP_LEVELS]; diff --git a/ext/native/net/sinks.cpp b/ext/native/net/sinks.cpp index dbfa8643df59..3bdf185f1cf6 100644 --- a/ext/native/net/sinks.cpp +++ b/ext/native/net/sinks.cpp @@ -190,6 +190,9 @@ void InputSink::AccountFill(int bytes) { int err = WSAGetLastError(); if (err == WSAEWOULDBLOCK) return; +#elif PPSSPP_PLATFORM(WIIU) + if (socketlasterr() == SO_EWOULDBLOCK) + return; #else if (errno == EWOULDBLOCK || errno == EAGAIN) return; diff --git a/ext/native/thin3d/GX2Shaders.c b/ext/native/thin3d/GX2Shaders.c new file mode 100644 index 000000000000..e0a84a9fe70f --- /dev/null +++ b/ext/native/thin3d/GX2Shaders.c @@ -0,0 +1,292 @@ +#undef ARRAY_SIZE + +#include +#include +#include + +static GX2AttribVar attributes[] = +{ + { "position", GX2_SHADER_VAR_TYPE_FLOAT3, 0, 0}, + { "color", GX2_SHADER_VAR_TYPE_FLOAT4, 0, 1}, + { "tex_coord", GX2_SHADER_VAR_TYPE_FLOAT2, 0, 2}, +}; + +static GX2SamplerVar samplers[] = +{ + { "s", GX2_SAMPLER_VAR_TYPE_SAMPLER_2D, 0 }, +}; + +static GX2UniformBlock uniform_blocks[] = { + {"UBO", 1, 64} +}; + +static GX2UniformVar uniform_vars[] = { + {"global.MVP", GX2_SHADER_VAR_TYPE_MATRIX4X4, 1, 0, 0}, +}; + + +// clang-format off +__attribute__((aligned(GX2_SHADER_ALIGNMENT))) +static struct +{ + u64 cf[32]; + u64 alu[16]; +} vsColCode = +{ + { + CALL_FS NO_BARRIER, + ALU(32, 16) KCACHE0(CB1, _0_15), + EXP_DONE(POS0, _R1, _x, _y, _z, _w), + EXP_DONE(PARAM0, _R2, _x, _y, _z, _w) NO_BARRIER + END_OF_PROGRAM + }, + { + ALU_MUL(__,_x, _R1,_w, KC0(3),_y), + ALU_MUL(__,_y, _R1,_w, KC0(3),_x), + ALU_MUL(__,_z, _R1,_w, KC0(3),_w), + ALU_MUL(__,_w, _R1,_w, KC0(3),_z) + ALU_LAST, + ALU_MULADD(_R123,_x, _R1,_z, KC0(2),_y, ALU_SRC_PV,_x), + ALU_MULADD(_R123,_y, _R1,_z, KC0(2),_x, ALU_SRC_PV,_y), + ALU_MULADD(_R123,_z, _R1,_z, KC0(2),_w, ALU_SRC_PV,_z), + ALU_MULADD(_R123,_w, _R1,_z, KC0(2),_z, ALU_SRC_PV,_w) + ALU_LAST, + ALU_MULADD(_R123,_x, _R1,_y, KC0(1),_y, ALU_SRC_PV,_x), + ALU_MULADD(_R123,_y, _R1,_y, KC0(1),_x, 
ALU_SRC_PV,_y), + ALU_MULADD(_R123,_z, _R1,_y, KC0(1),_w, ALU_SRC_PV,_z), + ALU_MULADD(_R123,_w, _R1,_y, KC0(1),_z, ALU_SRC_PV,_w) + ALU_LAST, + ALU_MULADD(_R1,_x, _R1,_x, KC0(0),_x, ALU_SRC_PV,_y), + ALU_MULADD(_R1,_y, _R1,_x, KC0(0),_y, ALU_SRC_PV,_x), + ALU_MULADD(_R1,_z, _R1,_x, KC0(0),_z, ALU_SRC_PV,_w), + ALU_MULADD(_R1,_w, _R1,_x, KC0(0),_w, ALU_SRC_PV,_z) + ALU_LAST, + } +}; + +__attribute__((aligned(GX2_SHADER_ALIGNMENT))) +static struct +{ + u64 cf[32]; +} fsColCode = +{ + { + EXP_DONE(PIX0, _R0, _x, _y, _z, _w) + END_OF_PROGRAM + } +}; +// clang-format on +GX2VertexShader GX2_vsCol = { + { + .sq_pgm_resources_vs.num_gprs = 3, + .sq_pgm_resources_vs.stack_size = 1, + .spi_vs_out_config.vs_export_count = 1, + .num_spi_vs_out_id = 1, + { + { .semantic_0 = 0x00, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + }, + .sq_vtx_semantic_clear = ~0x3, + .num_sq_vtx_semantic = 2, + { + 0, 1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + }, + .vgt_vertex_reuse_block_cntl.vtx_reuse_depth = 0xE, + .vgt_hos_reuse_depth.reuse_depth = 0x10, + }, /* regs */ + .size 
= sizeof(vsColCode), + .program = (uint8_t *)&vsColCode, + .mode = GX2_SHADER_MODE_UNIFORM_BLOCK, + .uniformBlockCount = countof(uniform_blocks), uniform_blocks, + .uniformVarCount = countof(uniform_vars), uniform_vars, + .attribVarCount = 2, attributes, + .gx2rBuffer.flags = GX2R_RESOURCE_LOCKED_READ_ONLY, +}; + +GX2PixelShader GX2_fsCol = { + { + .sq_pgm_resources_ps.num_gprs = 1, + .sq_pgm_exports_ps.export_mode = 0x2, + .spi_ps_in_control_0.num_interp = 2, + .spi_ps_in_control_0.persp_gradient_ena = 1, + .spi_ps_in_control_0.baryc_sample_cntl = spi_baryc_cntl_centers_only, + .num_spi_ps_input_cntl = 1, + { { .semantic = 0, .default_val = 1 }}, + .cb_shader_mask.output0_enable = 0xF, + .cb_shader_control.rt0_enable = TRUE, + .db_shader_control.z_order = db_z_order_early_z_then_late_z, + }, /* regs */ + .size = sizeof(fsColCode), + .program = (uint8_t *)&fsColCode, + .mode = GX2_SHADER_MODE_UNIFORM_BLOCK, + .gx2rBuffer.flags = GX2R_RESOURCE_LOCKED_READ_ONLY, +}; +// clang-format off +__attribute__((aligned(GX2_SHADER_ALIGNMENT))) +static struct { + u64 cf[32]; + u64 alu[16]; +} vsTexColCode = { + { + CALL_FS NO_BARRIER, + ALU(32, 16) KCACHE0(CB1, _0_15), + EXP_DONE(POS0, _R1, _x, _y, _z, _w), + EXP(PARAM0, _R2, _x, _y, _z, _w) NO_BARRIER, + EXP_DONE(PARAM1, _R3, _x, _y, _0, _0) NO_BARRIER + END_OF_PROGRAM + }, + { + ALU_MUL(__, _x, _R1, _w, KC0(3), _y), + ALU_MUL(__, _y, _R1, _w, KC0(3), _x), + ALU_MUL(__, _z, _R1, _w, KC0(3), _w), + ALU_MUL(__, _w, _R1, _w, KC0(3), _z) + ALU_LAST, + ALU_MULADD(_R123, _x, _R1, _z, KC0(2), _y, ALU_SRC_PV, _x), + ALU_MULADD(_R123, _y, _R1, _z, KC0(2), _x, ALU_SRC_PV, _y), + ALU_MULADD(_R123, _z, _R1, _z, KC0(2), _w, ALU_SRC_PV, _z), + ALU_MULADD(_R123, _w, _R1, _z, KC0(2), _z, ALU_SRC_PV, _w) + ALU_LAST, + ALU_MULADD(_R123, _x, _R1, _y, KC0(1), _y, ALU_SRC_PV, _x), + ALU_MULADD(_R123, _y, _R1, _y, KC0(1), _x, ALU_SRC_PV, _y), + ALU_MULADD(_R123, _z, _R1, _y, KC0(1), _w, ALU_SRC_PV, _z), + ALU_MULADD(_R123, _w, _R1, _y, KC0(1), _z, 
ALU_SRC_PV, _w) + ALU_LAST, + ALU_MULADD(_R1, _x, _R1, _x, KC0(0), _x, ALU_SRC_PV, _y), + ALU_MULADD(_R1, _y, _R1, _x, KC0(0), _y, ALU_SRC_PV, _x), + ALU_MULADD(_R1, _z, _R1, _x, KC0(0), _z, ALU_SRC_PV, _w), + ALU_MULADD(_R1, _w, _R1, _x, KC0(0), _w, ALU_SRC_PV, _z) + ALU_LAST, + } +}; + +__attribute__((aligned(GX2_SHADER_ALIGNMENT))) static struct { + u64 cf[32]; + u64 alu[16]; + u64 tex[1 * 2]; +} fsTexColCode = +{ + { + TEX(48, 1) VALID_PIX, + ALU(32, 4), + EXP_DONE(PIX0, _R0, _x, _y, _z, _w) + END_OF_PROGRAM + }, + { + ALU_MUL(_R0, _x, _R0, _x, _R1, _x), + ALU_MUL(_R0, _y, _R0, _y, _R1, _y), + ALU_MUL(_R0, _z, _R0, _z, _R1, _z), + ALU_MUL(_R0, _w, _R0, _w, _R1, _w) + ALU_LAST + }, + { + TEX_SAMPLE(_R1, _x, _y, _z, _w, _R1, _x, _y, _0, _0, _t0, _s0) + } +}; + +__attribute__((aligned(GX2_SHADER_ALIGNMENT))) static struct { + u64 cf[32]; + u64 alu[16]; + u64 tex[1 * 2]; +} fsTexColCode_sw = +{ + { + TEX(48, 1) VALID_PIX, + ALU(32, 4), + EXP_DONE(PIX0, _R0, _z, _y, _x, _w) + END_OF_PROGRAM + }, + { + ALU_MUL(_R0, _x, _R0, _x, _R1, _x), + ALU_MUL(_R0, _y, _R0, _y, _R1, _y), + ALU_MUL(_R0, _z, _R0, _z, _R1, _z), + ALU_MUL(_R0, _w, _R0, _w, _R1, _w) + ALU_LAST + }, + { + TEX_SAMPLE(_R1, _x, _y, _z, _w, _R1, _x, _y, _0, _0, _t0, _s0) + } +}; +// clang-format on + +GX2VertexShader GX2_vsTexCol = { + { + .sq_pgm_resources_vs.num_gprs = 4, + .sq_pgm_resources_vs.stack_size = 1, + .spi_vs_out_config.vs_export_count = 1, + .num_spi_vs_out_id = 1, + { + { .semantic_0 = 0x00, .semantic_1 = 0x01, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 
0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + { .semantic_0 = 0xFF, .semantic_1 = 0xFF, .semantic_2 = 0xFF, .semantic_3 = 0xFF }, + }, + .sq_vtx_semantic_clear = ~0x7, + .num_sq_vtx_semantic = 3, + { + 0, 1, 2, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + }, + .vgt_vertex_reuse_block_cntl.vtx_reuse_depth = 0xE, + .vgt_hos_reuse_depth.reuse_depth = 0x10, + }, /* regs */ + .size = sizeof(vsTexColCode), + .program = (uint8_t *)&vsTexColCode, + .mode = GX2_SHADER_MODE_UNIFORM_BLOCK, + .uniformBlockCount = countof(uniform_blocks), uniform_blocks, + .uniformVarCount = countof(uniform_vars), uniform_vars, + .attribVarCount = 3, attributes, + .gx2rBuffer.flags = GX2R_RESOURCE_LOCKED_READ_ONLY, +}; + +GX2PixelShader GX2_fsTexCol = { + { + .sq_pgm_resources_ps.num_gprs = 2, + .sq_pgm_exports_ps.export_mode = 0x2, + .spi_ps_in_control_0.num_interp = 2, + .spi_ps_in_control_0.persp_gradient_ena = 1, + .spi_ps_in_control_0.baryc_sample_cntl = spi_baryc_cntl_centers_only, + .num_spi_ps_input_cntl = 2, + { { .semantic = 0, .default_val = 1 }, { .semantic = 1, .default_val = 1 } }, + .cb_shader_mask.output0_enable = 0xF, + .cb_shader_control.rt0_enable = TRUE, + .db_shader_control.z_order = db_z_order_early_z_then_late_z, + }, /* regs */ + .size = sizeof(fsTexColCode), + .program = (uint8_t *)&fsTexColCode, + .mode = GX2_SHADER_MODE_UNIFORM_BLOCK, + .samplerVarCount = countof(samplers), samplers, + .gx2rBuffer.flags = GX2R_RESOURCE_LOCKED_READ_ONLY, +}; + +GX2PixelShader GX2_fsTexCol_sw = { + { + .sq_pgm_resources_ps.num_gprs = 2, + .sq_pgm_exports_ps.export_mode = 0x2, + .spi_ps_in_control_0.num_interp = 2, + .spi_ps_in_control_0.persp_gradient_ena = 1, + 
.spi_ps_in_control_0.baryc_sample_cntl = spi_baryc_cntl_centers_only, + .num_spi_ps_input_cntl = 2, + { { .semantic = 0, .default_val = 1 }, { .semantic = 1, .default_val = 1 } }, + .cb_shader_mask.output0_enable = 0xF, + .cb_shader_control.rt0_enable = TRUE, + .db_shader_control.z_order = db_z_order_early_z_then_late_z, + }, /* regs */ + .size = sizeof(fsTexColCode_sw), + .program = (uint8_t *)&fsTexColCode_sw, + .mode = GX2_SHADER_MODE_UNIFORM_BLOCK, + .samplerVarCount = countof(samplers), samplers, + .gx2rBuffer.flags = GX2R_RESOURCE_LOCKED_READ_ONLY, +}; diff --git a/ext/native/thin3d/thin3d.cpp b/ext/native/thin3d/thin3d.cpp index aabb1d818b61..f1ef9f695ff4 100644 --- a/ext/native/thin3d/thin3d.cpp +++ b/ext/native/thin3d/thin3d.cpp @@ -401,10 +401,10 @@ DrawContext::~DrawContext() { // Could also make C fake-simd for 64-bit, two 8888 pixels fit in a register :) void ConvertFromRGBA8888(uint8_t *dst, const uint8_t *src, uint32_t dstStride, uint32_t srcStride, uint32_t width, uint32_t height, DataFormat format) { // Must skip stride in the cases below. Some games pack data into the cracks, like MotoGP. 
- const uint32_t *src32 = (const uint32_t *)src; + const u32_le *src32 = (const u32_le *)src; if (format == Draw::DataFormat::R8G8B8A8_UNORM) { - uint32_t *dst32 = (uint32_t *)dst; + u32_le *dst32 = (u32_le *)dst; if (src == dst) { return; } else { @@ -424,7 +424,7 @@ void ConvertFromRGBA8888(uint8_t *dst, const uint8_t *src, uint32_t dstStride, u } } else { // But here it shouldn't matter if they do intersect - uint16_t *dst16 = (uint16_t *)dst; + u16_le *dst16 = (u16_le *)dst; switch (format) { case Draw::DataFormat::R5G6B5_UNORM_PACK16: // BGR 565 for (uint32_t y = 0; y < height; ++y) { @@ -460,10 +460,10 @@ void ConvertFromRGBA8888(uint8_t *dst, const uint8_t *src, uint32_t dstStride, u // Could also make C fake-simd for 64-bit, two 8888 pixels fit in a register :) void ConvertFromBGRA8888(uint8_t *dst, const uint8_t *src, uint32_t dstStride, uint32_t srcStride, uint32_t width, uint32_t height, DataFormat format) { // Must skip stride in the cases below. Some games pack data into the cracks, like MotoGP. 
- const uint32_t *src32 = (const uint32_t *)src; + const u32_le *src32 = (const u32_le *)src; if (format == Draw::DataFormat::B8G8R8A8_UNORM) { - uint32_t *dst32 = (uint32_t *)dst; + u32_le *dst32 = (u32_le *)dst; if (src == dst) { return; } else { @@ -474,7 +474,7 @@ void ConvertFromBGRA8888(uint8_t *dst, const uint8_t *src, uint32_t dstStride, u } } } else if (format == Draw::DataFormat::R8G8B8A8_UNORM) { - uint32_t *dst32 = (uint32_t *)dst; + u32_le *dst32 = (u32_le *)dst; for (uint32_t y = 0; y < height; ++y) { ConvertBGRA8888ToRGBA8888(dst32, src32, width); src32 += srcStride; diff --git a/ext/native/thin3d/thin3d.h b/ext/native/thin3d/thin3d.h index e3f6390afee4..1ac1092e12bd 100644 --- a/ext/native/thin3d/thin3d.h +++ b/ext/native/thin3d/thin3d.h @@ -562,7 +562,7 @@ struct RenderPassInfo { class DrawContext { public: virtual ~DrawContext(); - bool CreatePresets(); + virtual bool CreatePresets(); void DestroyPresets(); Bugs GetBugs() const { return bugs_; } diff --git a/ext/native/thin3d/thin3d_create.h b/ext/native/thin3d/thin3d_create.h index a89c74ab96a7..a812f234c74e 100644 --- a/ext/native/thin3d/thin3d_create.h +++ b/ext/native/thin3d/thin3d_create.h @@ -22,6 +22,9 @@ struct ID3D11DeviceContext; struct ID3D11Device1; struct ID3D11DeviceContext1; +#endif +#ifdef __wiiu__ +#include #endif class VulkanContext; @@ -37,4 +40,8 @@ DrawContext *T3DCreateD3D11Context(ID3D11Device *device, ID3D11DeviceContext *co DrawContext *T3DCreateVulkanContext(VulkanContext *context, bool splitSubmit); +#ifdef __wiiu__ +DrawContext *T3DCreateGX2Context(GX2ContextState *context_state, GX2ColorBuffer* color_buffer, GX2DepthBuffer *depth_buffer); +#endif + } // namespace Draw diff --git a/ext/native/thin3d/thin3d_gx2.cpp b/ext/native/thin3d/thin3d_gx2.cpp new file mode 100644 index 000000000000..f1c521abf421 --- /dev/null +++ b/ext/native/thin3d/thin3d_gx2.cpp @@ -0,0 +1,1064 @@ +#include "ppsspp_config.h" + +#include "profiler/profiler.h" +#include "thin3d/thin3d.h" +#include 
"base/display.h" +#include "math/dataconv.h" +#include "util/text/utf8.h" +#include "Common/ColorConv.h" +#include +#include + +#include +#include +#include + +extern "C" GX2VertexShader GX2_vsTexCol, GX2_vsCol; +extern "C" GX2PixelShader GX2_fsTexCol, GX2_fsTexCol_sw, GX2_fsCol; + +namespace Draw { + +static const GX2CompareFunction compareToGX2[] = { GX2_COMPARE_FUNC_NEVER, GX2_COMPARE_FUNC_LESS, GX2_COMPARE_FUNC_EQUAL, GX2_COMPARE_FUNC_LEQUAL, GX2_COMPARE_FUNC_GREATER, GX2_COMPARE_FUNC_NOT_EQUAL, GX2_COMPARE_FUNC_GEQUAL, GX2_COMPARE_FUNC_ALWAYS }; + +static const GX2StencilFunction stencilOpToGX2[] = { + GX2_STENCIL_FUNCTION_KEEP, GX2_STENCIL_FUNCTION_ZERO, GX2_STENCIL_FUNCTION_REPLACE, GX2_STENCIL_FUNCTION_INCR_CLAMP, GX2_STENCIL_FUNCTION_DECR_CLAMP, GX2_STENCIL_FUNCTION_INV, GX2_STENCIL_FUNCTION_INCR_WRAP, GX2_STENCIL_FUNCTION_DECR_WRAP, +}; +static GX2PrimitiveMode primToGX2[] = { + GX2_PRIMITIVE_MODE_POINTS, GX2_PRIMITIVE_MODE_LINES, GX2_PRIMITIVE_MODE_LINE_STRIP, GX2_PRIMITIVE_MODE_TRIANGLES, GX2_PRIMITIVE_MODE_TRIANGLE_STRIP, GX2_PRIMITIVE_MODE_INVALID, + // Tesselation shader only + GX2_PRIMITIVE_MODE_INVALID, // GX2_PRIMITIVE_MODE_CONTROL_POINT_PATCHLIST, // ??? + // These are for geometry shaders only. 
+ GX2_PRIMITIVE_MODE_INVALID, // GX2_PRIMITIVE_MODE_LINELIST_ADJ, + GX2_PRIMITIVE_MODE_INVALID, // GX2_PRIMITIVE_MODE_LINESTRIP_ADJ, + GX2_PRIMITIVE_MODE_INVALID, // GX2_PRIMITIVE_MODE_TRIANGLELIST_ADJ, + GX2_PRIMITIVE_MODE_INVALID, // GX2_PRIMITIVE_MODE_TRIANGLESTRIP_ADJ, +}; + +static const GX2BlendCombineMode blendOpToGX2[] = { + GX2_BLEND_COMBINE_MODE_ADD, GX2_BLEND_COMBINE_MODE_SUB, GX2_BLEND_COMBINE_MODE_REV_SUB, GX2_BLEND_COMBINE_MODE_MIN, GX2_BLEND_COMBINE_MODE_MAX, +}; + +static const GX2BlendMode blendToGX2[] = { + GX2_BLEND_MODE_ZERO, GX2_BLEND_MODE_ONE, GX2_BLEND_MODE_SRC_COLOR, GX2_BLEND_MODE_INV_SRC_COLOR, GX2_BLEND_MODE_DST_COLOR, GX2_BLEND_MODE_INV_DST_COLOR, GX2_BLEND_MODE_SRC_ALPHA, GX2_BLEND_MODE_INV_SRC_ALPHA, GX2_BLEND_MODE_DST_ALPHA, GX2_BLEND_MODE_INV_DST_ALPHA, GX2_BLEND_MODE_BLEND_FACTOR, GX2_BLEND_MODE_INV_BLEND_FACTOR, GX2_BLEND_MODE_BLEND_FACTOR, GX2_BLEND_MODE_INV_BLEND_FACTOR, GX2_BLEND_MODE_SRC1_COLOR, GX2_BLEND_MODE_INV_SRC1_COLOR, GX2_BLEND_MODE_SRC1_ALPHA, GX2_BLEND_MODE_INV_SRC1_ALPHA, +}; + +static const GX2LogicOp logicOpToGX2[] = { + GX2_LOGIC_OP_CLEAR, GX2_LOGIC_OP_SET, GX2_LOGIC_OP_COPY, GX2_LOGIC_OP_INV_COPY, GX2_LOGIC_OP_NOP, GX2_LOGIC_OP_INV, GX2_LOGIC_OP_AND, GX2_LOGIC_OP_NOT_AND, GX2_LOGIC_OP_OR, GX2_LOGIC_OP_NOR, GX2_LOGIC_OP_XOR, GX2_LOGIC_OP_EQUIV, GX2_LOGIC_OP_REV_AND, GX2_LOGIC_OP_INV_AND, GX2_LOGIC_OP_REV_OR, GX2_LOGIC_OP_INV_OR, +}; + +static const GX2TexClampMode taddrToGX2[] = { + GX2_TEX_CLAMP_MODE_WRAP, + GX2_TEX_CLAMP_MODE_MIRROR, + GX2_TEX_CLAMP_MODE_CLAMP, + GX2_TEX_CLAMP_MODE_CLAMP_BORDER, +}; +static GX2SurfaceFormat dataFormatToGX2SurfaceFormat(DataFormat format) { + switch (format) { + case DataFormat::R32_FLOAT: return GX2_SURFACE_FORMAT_FLOAT_R32; + case DataFormat::R32G32_FLOAT: + return GX2_SURFACE_FORMAT_FLOAT_R32_G32; + // case DataFormat::R32G32B32_FLOAT: + // return GX2_SURFACE_FORMAT_FLOAT_R32_G32_B32; + case DataFormat::R32G32B32A32_FLOAT: return GX2_SURFACE_FORMAT_FLOAT_R32_G32_B32_A32; + 
case DataFormat::A4R4G4B4_UNORM_PACK16: return GX2_SURFACE_FORMAT_UNORM_R4_G4_B4_A4; + case DataFormat::A1R5G5B5_UNORM_PACK16: return GX2_SURFACE_FORMAT_UNORM_A1_B5_G5_R5; + case DataFormat::R5G5B5A1_UNORM_PACK16: return GX2_SURFACE_FORMAT_UNORM_R5_G5_B5_A1; + case DataFormat::R5G6B5_UNORM_PACK16: return GX2_SURFACE_FORMAT_UNORM_R5_G6_B5; + case DataFormat::R8G8B8_UNORM: + case DataFormat::R8G8B8A8_UNORM: return GX2_SURFACE_FORMAT_UNORM_R8_G8_B8_A8; + case DataFormat::R8G8B8A8_UNORM_SRGB: return GX2_SURFACE_FORMAT_SRGB_R8_G8_B8_A8; + case DataFormat::R16_FLOAT: return GX2_SURFACE_FORMAT_FLOAT_R16; + case DataFormat::R16G16_FLOAT: return GX2_SURFACE_FORMAT_FLOAT_R16_G16; + case DataFormat::R16G16B16A16_FLOAT: return GX2_SURFACE_FORMAT_FLOAT_R16_G16_B16_A16; + + case DataFormat::D16: return GX2_SURFACE_FORMAT_UNORM_D16; + case DataFormat::D24_S8: return GX2_SURFACE_FORMAT_UNORM_D24_S8; + case DataFormat::S8: return GX2_SURFACE_FORMAT_INVALID; + case DataFormat::D32F: return GX2_SURFACE_FORMAT_FLOAT_D32; + case DataFormat::D32F_S8: return GX2_SURFACE_FORMAT_FLOAT_D32_UINT_S8_X24; + + case DataFormat::ETC1: + default: return GX2_SURFACE_FORMAT_INVALID; + } +} + +static u32 dataFormatToGX2SurfaceCompSelect(DataFormat format) { + switch (format) { + case DataFormat::R16_FLOAT: + case DataFormat::R32_FLOAT: return GX2_COMP_SEL(_r, _0, _0, _1); + case DataFormat::R16G16_FLOAT: + case DataFormat::R32G32_FLOAT: return GX2_COMP_SEL(_r, _g, _0, _1); + case DataFormat::R8G8B8_UNORM: + case DataFormat::R5G6B5_UNORM_PACK16: return GX2_COMP_SEL(_r, _g, _b, _1); + case DataFormat::B8G8R8A8_UNORM: + case DataFormat::B8G8R8A8_UNORM_SRGB: return GX2_COMP_SEL(_b, _g, _r, _a); + default: return GX2_COMP_SEL(_r, _g, _b, _a); + } +} + +static int dataFormatToSwapSize(DataFormat format) { + switch (format) { + case DataFormat::A4R4G4B4_UNORM_PACK16: + case DataFormat::B4G4R4A4_UNORM_PACK16: + case DataFormat::R4G4B4A4_UNORM_PACK16: + case DataFormat::R5G6B5_UNORM_PACK16: + case 
DataFormat::B5G6R5_UNORM_PACK16: + case DataFormat::R5G5B5A1_UNORM_PACK16: + case DataFormat::B5G5R5A1_UNORM_PACK16: + case DataFormat::A1R5G5B5_UNORM_PACK16: + case DataFormat::R16_FLOAT: + case DataFormat::D16: + case DataFormat::R16G16_FLOAT: + case DataFormat::R16G16B16A16_FLOAT: return 2; + default: return 4; + } +} + +static GX2AttribFormat dataFormatToGX2AttribFormat(DataFormat format) { + switch (format) { + case DataFormat::R8_UNORM: return GX2_ATTRIB_FORMAT_UNORM_8; + case DataFormat::R8G8_UNORM: return GX2_ATTRIB_FORMAT_UNORM_8_8; + case DataFormat::B8G8R8A8_UNORM: + case DataFormat::R8G8B8A8_UNORM: return GX2_ATTRIB_FORMAT_UNORM_8_8_8_8; + case DataFormat::R8G8B8A8_UINT: return GX2_ATTRIB_FORMAT_UINT_8_8_8_8; + case DataFormat::R8G8B8A8_SNORM: return GX2_ATTRIB_FORMAT_SNORM_8_8_8_8; + case DataFormat::R8G8B8A8_SINT: return GX2_ATTRIB_FORMAT_SINT_8_8_8_8; + case DataFormat::R32_FLOAT: return GX2_ATTRIB_FORMAT_FLOAT_32; + case DataFormat::R32G32_FLOAT: return GX2_ATTRIB_FORMAT_FLOAT_32_32; + case DataFormat::R32G32B32_FLOAT: return GX2_ATTRIB_FORMAT_FLOAT_32_32_32; + case DataFormat::R32G32B32A32_FLOAT: return GX2_ATTRIB_FORMAT_FLOAT_32_32_32_32; + + default: return (GX2AttribFormat)-1; + } +} +static u32 dataFormatToGX2AttribCompSelect(DataFormat format) { + switch (format) { + case DataFormat::R8_UNORM: + case DataFormat::R32_FLOAT: return GX2_COMP_SEL(_x, _0, _0, _1); + case DataFormat::R8G8_UNORM: + case DataFormat::R32G32_FLOAT: return GX2_COMP_SEL(_x, _y, _0, _1); + case DataFormat::R32G32B32_FLOAT: return GX2_COMP_SEL(_x, _y, _z, _1); + case DataFormat::R8G8B8A8_UNORM_SRGB: + case DataFormat::B8G8R8A8_UNORM: + case DataFormat::B8G8R8A8_UNORM_SRGB: return GX2_COMP_SEL(_b, _g, _r, _a); + case DataFormat::R8G8B8A8_UNORM: + case DataFormat::R8G8B8A8_SNORM: + case DataFormat::R8G8B8A8_UINT: + case DataFormat::R8G8B8A8_SINT: return GX2_COMP_SEL(_a, _b, _g, _r); + default: return GX2_COMP_SEL(_x, _y, _z, _w); + } +} + +class GX2VertexShaderModule : public 
ShaderModule {
+public:
+	GX2VertexShaderModule(GX2VertexShader *shader) : shader_(shader) {}
+	~GX2VertexShaderModule() {
+		// Shaders whose gx2rBuffer carries GX2R_RESOURCE_LOCKED_READ_ONLY are left alone;
+		// everything else is released with free().
+		if (shader_->gx2rBuffer.flags & GX2R_RESOURCE_LOCKED_READ_ONLY)
+			return;
+		free(shader_);
+	}
+	ShaderStage GetStage() const { return ShaderStage::VERTEX; }
+
+	GX2VertexShader *shader_;
+};
+
+// ShaderModule wrapper around a GX2PixelShader (reports ShaderStage::FRAGMENT).
+class GX2PixelShaderModule : public ShaderModule {
+public:
+	GX2PixelShaderModule(GX2PixelShader *shader) : shader_(shader) {}
+	~GX2PixelShaderModule() {
+		// Same ownership rule as the vertex shader module: locked read-only shaders are not freed.
+		if (shader_->gx2rBuffer.flags & GX2R_RESOURCE_LOCKED_READ_ONLY)
+			return;
+		free(shader_);
+	}
+	ShaderStage GetStage() const { return ShaderStage::FRAGMENT; }
+
+	GX2PixelShader *shader_;
+};
+
+// ShaderModule wrapper around a GX2GeometryShader (reports ShaderStage::GEOMETRY).
+class GX2GeometryShaderModule : public ShaderModule {
+public:
+	GX2GeometryShaderModule(GX2GeometryShader *shader) : shader_(shader) {}
+	~GX2GeometryShaderModule() {
+		// Same ownership rule as the vertex shader module: locked read-only shaders are not freed.
+		if (shader_->gx2rBuffer.flags & GX2R_RESOURCE_LOCKED_READ_ONLY)
+			return;
+		free(shader_);
+	}
+	ShaderStage GetStage() const { return ShaderStage::GEOMETRY; }
+
+	GX2GeometryShader *shader_;
+};
+
+// Buffer backed by a MEM2 allocation. The low four bits of usageFlags select the
+// alignment and the cache-invalidate mode used for the buffer.
+class GX2Buffer : public Buffer {
+public:
+	GX2Buffer(size_t size, uint32_t usageFlags) : size_(size) {
+		int align;
+		switch (usageFlags & 0xF) {
+		case VERTEXDATA:
+			align = GX2_VERTEX_BUFFER_ALIGNMENT;
+			invMode_ = GX2_INVALIDATE_MODE_CPU_ATTRIBUTE_BUFFER;
+			break;
+		case INDEXDATA:
+			align = GX2_INDEX_BUFFER_ALIGNMENT;
+			invMode_ = GX2_INVALIDATE_MODE_CPU_ATTRIBUTE_BUFFER;
+			break;
+		case UNIFORM:
+			// Uniform data is byte-swapped on upload (see needswap) and the size is
+			// rounded up to a multiple of 0x40 bytes.
+			needswap = true;
+			size_ = (size_ + 0x3F) & ~0x3F;
+			/* fallthrough */
+		default:
+		case GENERIC: align = GX2_UNIFORM_BLOCK_ALIGNMENT; invMode_ = GX2_INVALIDATE_MODE_CPU_UNIFORM_BLOCK;
+		}
+		data_ = (u8 *)MEM2_alloc(size_, align);
+	}
+	~GX2Buffer() { MEM2_free(data_); }
+
+	size_t size_;
+	u8 *data_;
+	GX2InvalidateMode invMode_;
+	bool needswap = false;
+};
+
+// Pre-computed depth/stencil control register built from a DepthStencilStateDesc.
+class GX2DepthStencilState : public DepthStencilState {
+public:
+	GX2DepthStencilState(const DepthStencilStateDesc &desc) {
+		// desc.stencilEnabled is passed twice: once for the stencil test and once for the
+		// back-face stencil enable.
+		GX2InitDepthStencilControlReg(&reg_, desc.depthTestEnabled, desc.depthWriteEnabled, compareToGX2[(int)desc.depthCompare], desc.stencilEnabled, desc.stencilEnabled, compareToGX2[(int)desc.front.compareOp], stencilOpToGX2[(int)desc.front.passOp], stencilOpToGX2[(int)desc.front.depthFailOp], stencilOpToGX2[(int)desc.front.failOp], compareToGX2[(int)desc.back.compareOp], stencilOpToGX2[(int)desc.back.passOp], stencilOpToGX2[(int)desc.back.depthFailOp], stencilOpToGX2[(int)desc.back.failOp]);
+	}
+	~GX2DepthStencilState() {}
+	GX2DepthStencilControlReg reg_;
+};
+
+// Pre-computed blend, color-control and channel-mask registers for render target 0.
+class GX2BlendState : public BlendState {
+public:
+	GX2BlendState(const BlendStateDesc &desc) {
+		// NOTE(review): the separate-alpha argument is "(int)srcAlpha && (int)dstAlpha", i.e. it is
+		// only set when both alpha factors have a non-zero enum value - confirm this is intended.
+		GX2InitBlendControlReg(&reg, GX2_RENDER_TARGET_0, blendToGX2[(int)desc.srcCol], blendToGX2[(int)desc.dstCol], blendOpToGX2[(int)desc.eqCol], (int)desc.srcAlpha && (int)desc.dstAlpha, blendToGX2[(int)desc.srcAlpha], blendToGX2[(int)desc.dstAlpha], blendOpToGX2[(int)desc.eqAlpha]);
+		// Without a logic op the raster op falls back to GX2_LOGIC_OP_COPY.
+		GX2InitColorControlReg(&color_reg, desc.logicEnabled ? logicOpToGX2[(int)desc.logicOp] : GX2_LOGIC_OP_COPY, desc.enabled ? 0xFF : 0x00, false, desc.colorMask != 0);
+		// Only the first target gets the requested mask; the remaining seven are masked off.
+		GX2InitTargetChannelMasksReg(&mask_reg, (GX2ChannelMask)desc.colorMask, (GX2ChannelMask)0, (GX2ChannelMask)0, (GX2ChannelMask)0, (GX2ChannelMask)0, (GX2ChannelMask)0, (GX2ChannelMask)0, (GX2ChannelMask)0);
+		logicEnabled = desc.logicEnabled;
+	}
+	~GX2BlendState() {}
+	GX2BlendControlReg reg;
+	GX2ColorControlReg color_reg;
+	GX2TargetChannelMaskReg mask_reg;
+	bool logicEnabled;
+};
+
+class GX2RasterState : public RasterState {
+public:
+	GX2RasterState(const RasterStateDesc &desc) {
+		frontFace_ = desc.frontFace == Facing::CW ?
GX2_FRONT_FACE_CW : GX2_FRONT_FACE_CCW; + cullFront_ = desc.cull == CullMode::FRONT || desc.cull == CullMode::FRONT_AND_BACK; + cullBack_ = desc.cull == CullMode::BACK || desc.cull == CullMode::FRONT_AND_BACK; + } + ~GX2RasterState() {} + GX2FrontFace frontFace_; + BOOL cullFront_; + BOOL cullBack_; +}; + +class GX2SamplerState : public SamplerState { +public: + GX2SamplerState(const SamplerStateDesc &desc) { + static const GX2TexBorderType borderColorToGX2[] = { + GX2_TEX_BORDER_TYPE_TRANSPARENT_BLACK, + GX2_TEX_BORDER_TYPE_TRANSPARENT_BLACK, + GX2_TEX_BORDER_TYPE_BLACK, + GX2_TEX_BORDER_TYPE_WHITE, + }; + + GX2InitSampler(&sampler_, taddrToGX2[(int)desc.wrapU], (GX2TexXYFilterMode)desc.magFilter); + GX2InitSamplerBorderType(&sampler_, borderColorToGX2[(int)desc.borderColor]); + GX2InitSamplerClamping(&sampler_, taddrToGX2[(int)desc.wrapU], taddrToGX2[(int)desc.wrapV], taddrToGX2[(int)desc.wrapW]); + if (desc.shadowCompareEnabled) + GX2InitSamplerDepthCompare(&sampler_, compareToGX2[(int)desc.shadowCompareFunc]); + GX2InitSamplerLOD(&sampler_, 0.0f, desc.maxLod, 0.0f); + GX2InitSamplerXYFilter(&sampler_, (GX2TexXYFilterMode)desc.magFilter, (GX2TexXYFilterMode)desc.minFilter, GX2_TEX_ANISO_RATIO_NONE); + GX2InitSamplerZMFilter(&sampler_, (GX2TexZFilterMode)((int)desc.mipFilter + 1), (GX2TexMipFilterMode)((int)desc.mipFilter + 1)); + } + ~GX2SamplerState() {} + GX2Sampler sampler_ = {}; +}; + +class GX2InputLayout : public InputLayout { +public: + GX2InputLayout(const InputLayoutDesc &desc) { + for (size_t i = 0; i < desc.attributes.size(); i++) { + GX2AttribStream el; + el.location = desc.attributes[i].location; + el.buffer = desc.attributes[i].binding; + el.offset = desc.attributes[i].offset; + el.format = dataFormatToGX2AttribFormat(desc.attributes[i].format); + el.type = desc.bindings[desc.attributes[i].binding].instanceRate ? 
GX2_ATTRIB_INDEX_PER_INSTANCE : GX2_ATTRIB_INDEX_PER_VERTEX;
+			el.aluDivisor = 0;
+			el.mask = dataFormatToGX2AttribCompSelect(desc.attributes[i].format);
+			el.endianSwap = GX2_ENDIAN_SWAP_DEFAULT;
+			attribute_stream.push_back(el);
+		}
+		for (size_t i = 0; i < desc.bindings.size(); i++) {
+			strides.push_back(desc.bindings[i].stride);
+		}
+		fs.size = GX2CalcFetchShaderSizeEx(desc.attributes.size(), GX2_FETCH_SHADER_TESSELLATION_NONE, GX2_TESSELLATION_MODE_DISCRETE);
+		fs.program = (u8 *)MEM2_alloc(fs.size, GX2_SHADER_ALIGNMENT);
+		GX2InitFetchShaderEx(&fs, fs.program, desc.attributes.size(), attribute_stream.data(), GX2_FETCH_SHADER_TESSELLATION_NONE, GX2_TESSELLATION_MODE_DISCRETE);
+		GX2Invalidate(GX2_INVALIDATE_MODE_CPU_SHADER, fs.program, fs.size);
+	}
+	~GX2InputLayout() { MEM2_free(fs.program); }
+	std::vector<GX2AttribStream> attribute_stream;
+	std::vector<int> strides;
+	GX2FetchShader fs = {};
+};
+
+// Bundles the state objects and shader modules selected by a PipelineDesc.
+// The pipeline holds one reference on every object it stores and drops them
+// again in its destructor.
+class GX2Pipeline : public Pipeline {
+public:
+	GX2Pipeline(const PipelineDesc &desc) {
+		prim_ = primToGX2[(int)desc.prim];
+		inputLayout_ = (GX2InputLayout *)desc.inputLayout;
+		inputLayout_->AddRef();
+		depthStencil_ = (GX2DepthStencilState *)desc.depthStencil;
+		depthStencil_->AddRef();
+		blend_ = (GX2BlendState *)desc.blend;
+		blend_->AddRef();
+		raster_ = (GX2RasterState *)desc.raster;
+		raster_->AddRef();
+
+		// Sort the supplied shader modules into their stage slots.
+		for (ShaderModule *shader : desc.shaders) {
+			switch (shader->GetStage()) {
+			case ShaderStage::VERTEX:
+				vs_ = (GX2VertexShaderModule *)shader;
+				vs_->AddRef();
+				break;
+			case ShaderStage::FRAGMENT:
+				ps_ = (GX2PixelShaderModule *)shader;
+				ps_->AddRef();
+				break;
+			case ShaderStage::GEOMETRY:
+				gs_ = (GX2GeometryShaderModule *)shader;
+				gs_->AddRef();
+				break;
+			}
+		}
+		// A pipeline without a uniform description (or with an empty one) gets no UBO;
+		// guard the pointer so such a description cannot crash the constructor.
+		if (desc.uniformDesc && desc.uniformDesc->uniformBufferSize)
+			ubo = new GX2Buffer(desc.uniformDesc->uniformBufferSize, BufferUsageFlag::DYNAMIC | BufferUsageFlag::UNIFORM);
+	}
+	~GX2Pipeline() {
+		inputLayout_->Release();
+		depthStencil_->Release();
+		blend_->Release();
+		raster_->Release();
+		// Shader slots and the UBO are optional, so they are released only when set.
+		if (vs_)
+			vs_->Release();
+		if (ps_)
+			ps_->Release();
+		if (gs_)
+			gs_->Release();
+		if (ubo)
+			ubo->Release();
+	}
+	bool RequiresBuffer() { return true; }
+	GX2PrimitiveMode prim_;
+	GX2VertexShaderModule *vs_ = nullptr;
+	GX2PixelShaderModule *ps_ = nullptr;
+	GX2GeometryShaderModule *gs_ = nullptr;
+	GX2InputLayout *inputLayout_;
+	GX2DepthStencilState *depthStencil_;
+	GX2BlendState *blend_;
+	GX2RasterState *raster_;
+	GX2Buffer *ubo = nullptr;
+};
+
+class GX2TextureObject : public Texture {
+public:
+	GX2TextureObject(const TextureDesc &desc) {
+		_assert_(desc.initData.size());
+		_assert_(desc.initData[0]);
+
+		tex.surface.width = desc.width;
+		tex.surface.height = desc.height;
+		tex.surface.depth = 1;
+		tex.surface.dim = GX2_SURFACE_DIM_TEXTURE_2D;
+		tex.surface.tileMode = GX2_TILE_MODE_LINEAR_ALIGNED;
+		tex.surface.use = GX2_SURFACE_USE_TEXTURE;
+		tex.viewNumSlices = 1;
+
+		tex.surface.format = dataFormatToGX2SurfaceFormat(desc.format);
+		tex.compMap = dataFormatToGX2SurfaceCompSelect(desc.format);
+
+		GX2CalcSurfaceSizeAndAlignment(&tex.surface);
+		GX2InitTextureRegs(&tex);
+		width_ = tex.surface.width;
+		height_ = tex.surface.height;
+		depth_ = tex.surface.depth;
+
+		tex.surface.image = MEM2_alloc(tex.surface.imageSize, tex.surface.alignment);
+		_assert_(tex.surface.image);
+		memset(tex.surface.image, 0xFF, tex.surface.imageSize);
+		if (desc.initDataCallback) {
+			desc.initDataCallback((u8 *)tex.surface.image, desc.initData[0], width_, height_, depth_, tex.surface.pitch * DataFormatSizeInBytes(desc.format), 1);
+		} else {
+			const u8 *src = desc.initData[0];
+			u8 *dst = (u8 *)tex.surface.image;
+			for (int i = 0; i < desc.height; i++) {
+				memcpy(dst, src, desc.width * DataFormatSizeInBytes(desc.format));
+				dst += tex.surface.pitch * DataFormatSizeInBytes(desc.format);
+				src += desc.width * DataFormatSizeInBytes(desc.format);
+			}
+		}
+#if 0
+		DEBUG_STR(desc.tag);
+		DEBUG_VAR(desc.format);
+		DEBUG_VAR(desc.width);
+		DEBUG_VAR(desc.height);
+		DEBUG_VAR(desc.type);
+
DEBUG_VAR(tex.compMap); + DEBUG_VAR(tex.surface.dim); + DEBUG_VAR(tex.surface.format); +#endif + GX2Invalidate(GX2_INVALIDATE_MODE_CPU_TEXTURE, tex.surface.image, tex.surface.imageSize); + } + ~GX2TextureObject() { MEM2_free(tex.surface.image); } + GX2Texture tex = {}; +}; + +class GX2Framebuffer : public Framebuffer { +public: + GX2Framebuffer(const FramebufferDesc &desc) { + _assert_(desc.numColorAttachments == 1); + _assert_(desc.depth == 1); + tag = desc.tag; + colorBuffer.surface.width = desc.width; + colorBuffer.surface.height = desc.height; + colorBuffer.surface.depth = 1; + colorBuffer.surface.dim = GX2_SURFACE_DIM_TEXTURE_2D; + colorBuffer.surface.tileMode = GX2_TILE_MODE_DEFAULT; + colorBuffer.surface.use = (GX2SurfaceUse)(GX2_SURFACE_USE_COLOR_BUFFER | GX2_SURFACE_USE_TEXTURE); + colorBuffer.viewNumSlices = 1; + switch (desc.colorDepth) { + case FBO_565: colorBuffer.surface.format = GX2_SURFACE_FORMAT_UNORM_R5_G6_B5; break; + case FBO_4444: colorBuffer.surface.format = GX2_SURFACE_FORMAT_UNORM_R4_G4_B4_A4; break; + case FBO_5551: colorBuffer.surface.format = GX2_SURFACE_FORMAT_UNORM_R5_G5_B5_A1; break; + default: + case FBO_8888: colorBuffer.surface.format = GX2_SURFACE_FORMAT_UNORM_R8_G8_B8_A8; break; + } + GX2CalcSurfaceSizeAndAlignment(&colorBuffer.surface); + GX2InitColorBufferRegs(&colorBuffer); + + colorBuffer.surface.image = MEM1_alloc(colorBuffer.surface.imageSize, colorBuffer.surface.alignment); + if(colorBuffer.surface.image) + colorBuffer.surface.use = (GX2SurfaceUse)(colorBuffer.surface.use | GX2R_RESOURCE_USAGE_FORCE_MEM1); + else + { + colorBuffer.surface.image = MEM2_alloc(colorBuffer.surface.imageSize, colorBuffer.surface.alignment); + _assert_(colorBuffer.surface.image); + colorBuffer.surface.use = (GX2SurfaceUse)(colorBuffer.surface.use | GX2R_RESOURCE_USAGE_FORCE_MEM2); + } + GX2Invalidate(GX2_INVALIDATE_MODE_COLOR_BUFFER, colorBuffer.surface.image, colorBuffer.surface.imageSize); + + colorTexture.surface = colorBuffer.surface; + 
colorTexture.compMap = desc.colorDepth == FBO_565 ? GX2_COMP_SEL(_r, _g, _b, _1) : GX2_COMP_SEL(_r, _g, _b, _a); + colorTexture.viewNumSlices = 1; + GX2InitTextureRegs(&colorTexture); + + if (desc.z_stencil) { + depthBuffer.surface.width = desc.width; + depthBuffer.surface.height = desc.height; + depthBuffer.surface.depth = 1; + depthBuffer.surface.dim = GX2_SURFACE_DIM_TEXTURE_2D; + depthBuffer.surface.tileMode = GX2_TILE_MODE_DEFAULT; + depthBuffer.surface.use = (GX2SurfaceUse)(GX2_SURFACE_USE_DEPTH_BUFFER | GX2_SURFACE_USE_TEXTURE); + depthBuffer.viewNumSlices = 1; + depthBuffer.surface.format = GX2_SURFACE_FORMAT_UNORM_D24_S8; + GX2CalcSurfaceSizeAndAlignment(&depthBuffer.surface); + GX2InitDepthBufferRegs(&depthBuffer); + + depthBuffer.surface.image = MEM1_alloc(depthBuffer.surface.imageSize, depthBuffer.surface.alignment); + if(depthBuffer.surface.image) + depthBuffer.surface.use = (GX2SurfaceUse)(depthBuffer.surface.use | GX2R_RESOURCE_USAGE_FORCE_MEM1); + else + { + depthBuffer.surface.image = MEM2_alloc(depthBuffer.surface.imageSize, depthBuffer.surface.alignment); + _assert_(depthBuffer.surface.image); + depthBuffer.surface.use = (GX2SurfaceUse)(depthBuffer.surface.use | GX2R_RESOURCE_USAGE_FORCE_MEM2); + } + GX2Invalidate(GX2_INVALIDATE_MODE_DEPTH_BUFFER, depthBuffer.surface.image, depthBuffer.surface.imageSize); + + depthTexture.surface = depthBuffer.surface; + depthTexture.compMap = GX2_COMP_SEL(_x, _y, _0, _0); + depthTexture.viewNumSlices = 1; + GX2InitTextureRegs(&depthTexture); + } +#if 0 + DEBUG_STR(desc.tag); + DEBUG_VAR2(desc.width); + DEBUG_VAR2(desc.height); + DEBUG_VAR2(desc.colorDepth); + DEBUG_VAR2(colorBuffer.surface.imageSize); + DEBUG_VAR2(depthBuffer.surface.imageSize); + DEBUG_VAR2(MEM1_avail()); + DEBUG_VAR2(MEM2_avail()); +#endif + + } + ~GX2Framebuffer() { + if (colorBuffer.surface.use & GX2R_RESOURCE_USAGE_FORCE_MEM1) + MEM1_free(colorBuffer.surface.image); + else + MEM2_free(colorBuffer.surface.image); + + if 
(depthBuffer.surface.use & GX2R_RESOURCE_USAGE_FORCE_MEM1) + MEM1_free(depthBuffer.surface.image); + else + MEM2_free(depthBuffer.surface.image); +#if 0 + DEBUG_STR(tag.c_str()); + DEBUG_VAR2(MEM1_avail()); + DEBUG_VAR2(MEM2_avail()); +#endif + } + GX2ColorBuffer colorBuffer = {}; + GX2DepthBuffer depthBuffer = {}; + GX2Texture colorTexture = {}; + GX2Texture depthTexture = {}; + std::string tag; +}; + +static GX2VertexShaderModule vsCol(&GX2_vsCol); +static GX2VertexShaderModule vsTexCol(&GX2_vsTexCol); +static GX2PixelShaderModule fsCol(&GX2_fsCol); +static GX2PixelShaderModule fsTexCol(&GX2_fsTexCol); +static GX2PixelShaderModule fsTexCol_sw(&GX2_fsTexCol_sw); + +class GX2DrawContext : public DrawContext { +public: + GX2DrawContext(GX2ContextState *context_state, GX2ColorBuffer *color_buffer, GX2DepthBuffer *depth_buffer); + ~GX2DrawContext(); + bool CreatePresets() override { + vsPresets_[VS_TEXTURE_COLOR_2D] = &vsTexCol; + vsPresets_[VS_TEXTURE_COLOR_2D]->AddRef(); + vsPresets_[VS_COLOR_2D] = &vsCol; + vsPresets_[VS_COLOR_2D]->AddRef(); + + fsPresets_[FS_TEXTURE_COLOR_2D] = &fsTexCol; + fsPresets_[FS_TEXTURE_COLOR_2D]->AddRef(); + fsPresets_[FS_TEXTURE_COLOR_2D_RB_SWIZZLE] = &fsTexCol_sw; + fsPresets_[FS_TEXTURE_COLOR_2D_RB_SWIZZLE]->AddRef(); + fsPresets_[FS_COLOR_2D] = &fsCol; + fsPresets_[FS_COLOR_2D]->AddRef(); + + static_assert(VS_MAX_PRESET == 2 && FS_MAX_PRESET == 3, ""); + return true; + } + + const DeviceCaps &GetDeviceCaps() const override { return caps_; } + uint32_t GetSupportedShaderLanguages() const override { return 0; } + uint32_t GetDataFormatSupport(DataFormat fmt) const override; + + InputLayout *CreateInputLayout(const InputLayoutDesc &desc) override { return new GX2InputLayout(desc); } + DepthStencilState *CreateDepthStencilState(const DepthStencilStateDesc &desc) override { return new GX2DepthStencilState(desc); } + BlendState *CreateBlendState(const BlendStateDesc &desc) override { return new GX2BlendState(desc); } + SamplerState 
*CreateSamplerState(const SamplerStateDesc &desc) override { return new GX2SamplerState(desc); } + RasterState *CreateRasterState(const RasterStateDesc &desc) override { return new GX2RasterState(desc); } + Buffer *CreateBuffer(size_t size, uint32_t usageFlags) override { return new GX2Buffer(size, usageFlags); } + Pipeline *CreateGraphicsPipeline(const PipelineDesc &desc) override { return new GX2Pipeline(desc); } + Texture *CreateTexture(const TextureDesc &desc) override { return new GX2TextureObject(desc); } + ShaderModule *CreateShaderModule(ShaderStage stage, ShaderLanguage language, const u8 *data, size_t dataSize, const std::string &tag) override { + ERROR_LOG(G3D, "missing shader for %s: ", tag.c_str()); + Crash(); + return nullptr; + } + Framebuffer *CreateFramebuffer(const FramebufferDesc &desc) override { return new GX2Framebuffer(desc); } + + void UpdateBuffer(Buffer *buffer, const u8 *data, size_t offset, size_t size, UpdateBufferFlags flags) override; + + void CopyFramebufferImage(Framebuffer *src, int level, int x, int y, int z, Framebuffer *dst, int dstLevel, int dstX, int dstY, int dstZ, int width, int height, int depth, int channelBits, const char *tag) override; + bool BlitFramebuffer(Framebuffer *src, int srcX1, int srcY1, int srcX2, int srcY2, Framebuffer *dst, int dstX1, int dstY1, int dstX2, int dstY2, int channelBits, FBBlitFilter filter, const char *tag) override; + bool CopyFramebufferToMemorySync(Framebuffer *src, int channelBits, int x, int y, int w, int h, Draw::DataFormat format, void *pixels, int pixelStride, const char *tag) override; + + // These functions should be self explanatory. + void BindFramebufferAsRenderTarget(Framebuffer *fbo, const RenderPassInfo &rp, const char *tag) override; + // color must be 0, for now. 
+ void BindFramebufferAsTexture(Framebuffer *fbo, int binding, FBChannel channelBit, int attachment) override; + + uintptr_t GetFramebufferAPITexture(Framebuffer *fbo, int channelBit, int attachment) override; + + void GetFramebufferDimensions(Framebuffer *fbo, int *w, int *h) override; + + void BindTextures(int start, int count, Texture **textures) override; + void BindSamplerStates(int start, int count, SamplerState **states) override; + void BindVertexBuffers(int start, int count, Buffer **buffers, int *offsets) override; + void BindIndexBuffer(Buffer *indexBuffer, int offset) override; + void BindPipeline(Pipeline *pipeline) override; + + void UpdateDynamicUniformBuffer(const void *ub, size_t size) override; + + // Raster state + void SetScissorRect(int left, int top, int width, int height) override { GX2SetScissor(left, top, width, height); } + void SetViewports(int count, Viewport *viewports) override { + assert(count == 1); + GX2SetViewport(viewports->TopLeftX, viewports->TopLeftY, viewports->Width, viewports->Height, viewports->MinDepth, viewports->MaxDepth); + // needed to prevent overwriting memory outside the rendertarget; + // TODO: check and set this during draw calls instead. 
+ GX2SetScissor(0, 0, current_color_buffer_->surface.width, current_color_buffer_->surface.height); + } + void SetBlendFactor(float color[4]) override { + DEBUG_LINE(); + GX2SetBlendConstantColorReg((GX2BlendConstantColorReg *)color); + } + void SetStencilRef(uint8_t ref) override { /*TODO*/ } + + void InvalidateCachedState() override { + if (pipeline_) + pipeline_->Release(); + pipeline_ = nullptr; + indexBuffer_ = nullptr; + GX2SetContextState(context_state_); + GX2SetShaderMode(GX2_SHADER_MODE_UNIFORM_BLOCK); + } + + void Draw(int vertexCount, int offset) override; + void DrawIndexed(int vertexCount, int offset) override; + void DrawUP(const void *vdata, int vertexCount) override; + void Clear(int mask, uint32_t colorval, float depthVal, int stencilVal); + + void BeginFrame() override; + void EndFrame() override { /*TODO*/ } + + std::string GetInfoString(InfoField info) const override { + switch (info) { + case APIVERSION: return "1"; + case VENDORSTRING: return "AMD"; + case VENDOR: return ""; + case DRIVER: return "-"; + case SHADELANGVERSION: return "AMD R700 microcode"; + case APINAME: return "GX2"; + default: return "?"; + } + } + + uint64_t GetNativeObject(NativeObject obj) override { + switch (obj) { + case NativeObject::CONTEXT: return (uintptr_t)context_state_; + case NativeObject::BACKBUFFER_COLOR_VIEW: return current_color_buffer_? 
(uintptr_t)current_color_buffer_ : (uintptr_t)color_buffer_; + case NativeObject::CONTEXT_EX: + case NativeObject::DEVICE: + case NativeObject::DEVICE_EX: + case NativeObject::BACKBUFFER_DEPTH_VIEW: + case NativeObject::BACKBUFFER_COLOR_TEX: + case NativeObject::BACKBUFFER_DEPTH_TEX: + case NativeObject::FEATURE_LEVEL: + case NativeObject::COMPATIBLE_RENDERPASS: + case NativeObject::BACKBUFFER_RENDERPASS: + case NativeObject::FRAMEBUFFER_RENDERPASS: + case NativeObject::INIT_COMMANDBUFFER: + case NativeObject::BOUND_TEXTURE0_IMAGEVIEW: + case NativeObject::BOUND_TEXTURE1_IMAGEVIEW: + case NativeObject::RENDER_MANAGER: + default: + DEBUG_VAR(obj); + Crash(); + return 0; + } + } + + void HandleEvent(Event ev, int width, int height, void *param1, void *param2) override; + + virtual int GetCurrentStepId() const override { return 0; /*TODO*/ } + +private: + void ApplyCurrentState(); + + DeviceCaps caps_ = {}; + GX2ContextState *context_state_; + GX2ColorBuffer *color_buffer_; + GX2DepthBuffer *depth_buffer_; + GX2ColorBuffer *current_color_buffer_; + GX2DepthBuffer *current_depth_buffer_; + GX2Pipeline *pipeline_ = nullptr; + void *indexBuffer_ = nullptr; +}; + +GX2DrawContext::GX2DrawContext(GX2ContextState *context_state, GX2ColorBuffer *color_buffer, GX2DepthBuffer *depth_buffer) : context_state_(context_state), color_buffer_(color_buffer), depth_buffer_(depth_buffer), current_color_buffer_(color_buffer), current_depth_buffer_(depth_buffer) { + caps_.vendor = GPUVendor::VENDOR_AMD; + // caps_.anisoSupported = true; + caps_.depthRangeMinusOneToOne = false; + caps_.geometryShaderSupported = false; // for now + caps_.tesselationShaderSupported = false; + caps_.multiViewport = true; + caps_.dualSourceBlend = true; + caps_.logicOpSupported = true; + caps_.framebufferCopySupported = true; + caps_.framebufferBlitSupported = true; + caps_.framebufferDepthCopySupported = true; + caps_.framebufferDepthBlitSupported = true; +} + +GX2DrawContext::~GX2DrawContext() { 
BindPipeline(nullptr); }
+
+// Backbuffer/present notifications; none of the handled events needs any work here.
+void GX2DrawContext::HandleEvent(Event ev, int width, int height, void *param1, void *param2) {
+	DEBUG_LINE();
+	switch (ev) {
+	case Event::LOST_BACKBUFFER: {
+		break;
+	}
+	case Event::GOT_BACKBUFFER: {
+		break;
+	}
+	case Event::PRESENTED: break;
+	}
+}
+
+// Copies `size` bytes of uniform data into the bound pipeline's UBO, byte-swapping
+// every 32-bit word, then invalidates the written range for the GPU.
+void GX2DrawContext::UpdateDynamicUniformBuffer(const void *ub, size_t size) {
+	if (!pipeline_) {
+		ERROR_LOG(G3D, "GX2DrawContext::UpdateDynamicUniformBuffer called without an active pipeline.");
+		// Nothing to write into; bail out instead of dereferencing the null pipeline below.
+		return;
+	}
+	if (!pipeline_->ubo || pipeline_->ubo->size_ < size) {
+		Crash();
+		// Never fall through into the copy with a missing or too-small buffer.
+		return;
+	}
+	const u32 *src = (const u32 *)ub;
+	u32 *dst = (u32 *)pipeline_->ubo->data_;
+	// Whole 32-bit words only; a trailing partial word is not copied.
+	int count = size >> 2;
+	while (count--) {
+		*dst++ = __builtin_bswap32(*src++);
+	}
+	GX2Invalidate(GX2_INVALIDATE_MODE_CPU_UNIFORM_BLOCK, pipeline_->ubo->data_, size);
+}
+
+// Makes `pipeline` the current pipeline (nullptr unbinds) and programs its shaders
+// and fixed-function registers. The context holds exactly one reference on the
+// bound pipeline.
+void GX2DrawContext::BindPipeline(Pipeline *pipeline) {
+	GX2Pipeline *next = (GX2Pipeline *)pipeline;
+	// Take the new reference before dropping the old one so that re-binding the
+	// current pipeline cannot destroy it in between.
+	if (next)
+		next->AddRef();
+	if (pipeline_)
+		pipeline_->Release();
+	pipeline_ = next;
+
+	// The reference above is taken for every non-null pipeline so that the Release()
+	// on the next bind is always balanced; GPU state is only programmed when a
+	// vertex shader and an input layout are present.
+	if (!pipeline_ || !pipeline_->vs_ || !pipeline_->inputLayout_)
+		return;
+
+	GX2SetFetchShader(&pipeline_->inputLayout_->fs);
+	GX2SetVertexShader(pipeline_->vs_->shader_);
+	if (pipeline_->ps_)
+		GX2SetPixelShader(pipeline_->ps_->shader_);
+	if (pipeline_->gs_) {
+		GX2SetShaderMode(GX2_SHADER_MODE_GEOMETRY_SHADER);
+		GX2SetGeometryShader(pipeline_->gs_->shader_);
+	} else {
+		GX2SetShaderMode(GX2_SHADER_MODE_UNIFORM_BLOCK);
+	}
+	GX2SetBlendControlReg(&pipeline_->blend_->reg);
+	GX2SetColorControlReg(&pipeline_->blend_->color_reg);
+	GX2SetTargetChannelMasksReg(&pipeline_->blend_->mask_reg);
+	GX2SetDepthStencilControlReg(&pipeline_->depthStencil_->reg_);
+	GX2SetCullOnlyControl(pipeline_->raster_->frontFace_, pipeline_->raster_->cullFront_, pipeline_->raster_->cullBack_);
+	if (pipeline_->ubo) {
+		// The same UBO is bound to vertex uniform block slots 1 and 0.
+		GX2SetVertexUniformBlock(1, pipeline_->ubo->size_, pipeline_->ubo->data_);
+		GX2SetVertexUniformBlock(0, pipeline_->ubo->size_, pipeline_->ubo->data_);
+		// GX2SetPixelUniformBlock(0, pipeline_->ubo->size_, pipeline_->ubo->data_);
+		// GX2SetGeometryUniformBlock(0, pipeline_->ubo->size_, pipeline_->ubo->data_);
+	}
+}
+
+void GX2DrawContext::ApplyCurrentState() { DEBUG_LINE(); }
+
+// Writes `size` bytes from `data` into `buffer_` at `offset` and invalidates the
+// written range. Buffers created with the UNIFORM usage are byte-swapped per
+// 32-bit word when both offset and size are 4-byte aligned.
+void GX2DrawContext::UpdateBuffer(Buffer *buffer_, const u8 *data, size_t offset, size_t size, UpdateBufferFlags flags) {
+	GX2Buffer *buffer = (GX2Buffer *)buffer_;
+	// Reject writes that would run past the allocation.
+	if (offset > buffer->size_ || size > buffer->size_ - offset) {
+		ERROR_LOG(G3D, "GX2DrawContext::UpdateBuffer called with an out-of-range offset/size.");
+		return;
+	}
+	if (buffer->needswap && !(offset & 0x3) && !(size & 0x3)) {
+		const u32 *src = (const u32 *)data;
+		u32 *dst = (u32 *)(buffer->data_ + offset);
+		int count = size >> 2;
+		while (count--)
+			*dst++ = __builtin_bswap32(*src++);
+	} else {
+		memcpy(buffer->data_ + offset, data, size);
+	}
+	GX2Invalidate(buffer->invMode_, buffer->data_ + offset, size);
+}
+
+// Binds `count` vertex buffers to attribute buffer slots start .. start + count - 1,
+// using the strides recorded in the bound pipeline's input layout.
+void GX2DrawContext::BindVertexBuffers(int start, int count, Buffer **buffers, int *offsets) {
+	if (!pipeline_) {
+		ERROR_LOG(G3D, "GX2DrawContext::BindVertexBuffers called without an active pipeline.");
+		// The strides come from the pipeline's input layout, so nothing can be bound without one.
+		return;
+	}
+	// Every slot touched below must have a stride entry. The previous check was inverted:
+	// it rejected valid ranges and let out-of-range ones index past the stride table.
+	if (start < 0 || count < 0 || (size_t)start + (size_t)count > pipeline_->inputLayout_->strides.size()) {
+		ERROR_LOG(G3D, "GX2DrawContext::BindVertexBuffers called invalid start + count.");
+		return;
+	}
+
+	// NOTE(review): buffers[] and offsets[] are indexed by slot (start + n), not from 0.
+	// This is identical for start == 0 - confirm against callers that pass start > 0.
+	for (int i = start; i < start + count; i++) {
+		GX2Buffer *vbo = (GX2Buffer *)buffers[i];
+		u8 *data = vbo->data_;
+		size_t size = vbo->size_;
+		int stride = pipeline_->inputLayout_->strides[i];
+
+		// Apply the offset only when it is non-negative and lies inside the buffer.
+		if (offsets && offsets[i] >= 0 && (size_t)offsets[i] < size) {
+			data += offsets[i];
+			size -= offsets[i];
+		}
+		GX2SetAttribBuffer(i, size, stride, data);
+	}
+}
+
+// Remembers the address (buffer start + offset) that indexed draws read indices from.
+// A null buffer leaves the previously bound index pointer untouched.
+void GX2DrawContext::BindIndexBuffer(Buffer *indexBuffer, int offset) {
+	if (!indexBuffer) {
+		return;
+	}
+	indexBuffer_ = ((GX2Buffer *)indexBuffer)->data_ + offset;
+}
+
+void GX2DrawContext::Draw(int vertexCount, int offset) {
+	if (!pipeline_) {
+		ERROR_LOG(G3D, "GX2DrawContext::Draw called without an active pipeline.");
+		return;
+	}
+#if 0
+	struct Vertex {
+		float x, y, z;
+		float u, v;
+		uint32_t rgba;
+	};
+#define col 0xFFFFFFFF
+	__attribute__((aligned(GX2_VERTEX_BUFFER_ALIGNMENT))) static Vertex v[4] = {
+		{ 0, 0, 0, 0, 0, col },
+		{ 840, 0, 0, 1, 0, col },
+		{ 840, 480, 0, 1, 1, col },
+		{ 0,
480, 0, 0, 1, col } + }; + GX2SetAttribBuffer(0, sizeof(v), sizeof(*v), v); + GX2DrawEx(GX2_PRIMITIVE_MODE_QUADS, 4, 0, 1); +#else + + GX2DrawEx(pipeline_->prim_, vertexCount, offset, 1); + // TODO: get rid of this call, which is currently needed to prevent overwriting arribute memory during draw + PROFILE_THIS_SCOPE("GX2DrawDone"); + GX2DrawDone(); +#endif +} + +void GX2DrawContext::DrawIndexed(int indexCount, int offset) { + if (!pipeline_) { + ERROR_LOG(G3D, "GX2DrawContext::DrawIndexed called without an active pipeline."); + return; + } + if (!indexBuffer_) { + ERROR_LOG(G3D, "GX2DrawContext::DrawIndexed called without an active index buffer."); + return; + } + if (!indexCount) + return; + + GX2DrawIndexedImmediateEx(pipeline_->prim_, indexCount, GX2_INDEX_TYPE_U16, indexBuffer_, offset, 1); +} + +void GX2DrawContext::DrawUP(const void *vdata, int vertexCount) { DEBUG_LINE(); } + +uint32_t GX2DrawContext::GetDataFormatSupport(DataFormat fmt) const { + GX2AttribFormat afmt = dataFormatToGX2AttribFormat(fmt); + GX2SurfaceFormat sfmt = dataFormatToGX2SurfaceFormat(fmt); + uint32_t support = 0; + + if (afmt != (GX2AttribFormat)-1) + support |= FMT_INPUTLAYOUT; + + if (sfmt != GX2_SURFACE_FORMAT_INVALID) { + if (DataFormatIsDepthStencil(fmt)) { + support |= FMT_DEPTHSTENCIL; + if (sfmt != GX2_SURFACE_FORMAT_FLOAT_D24_S8) { + support |= FMT_TEXTURE; + } + } else { + support |= FMT_TEXTURE | FMT_RENDERTARGET; + } + // support |= FMT_AUTOGEN_MIPS; + } + + return support; +} + +void GX2DrawContext::BindTextures(int start, int count, Texture **textures) { + // GX2DrawDone(); + while (count--) { + GX2TextureObject *texture = (GX2TextureObject *)*textures++; + if (texture && texture->tex.surface.image) { + GX2SetPixelTexture(&texture->tex, start); + } + start++; + } +} + +void GX2DrawContext::BindSamplerStates(int start, int count, SamplerState **states) { + while (count--) { + GX2SamplerState *samplerState = (GX2SamplerState *)*states++; + if (samplerState) + 
GX2SetPixelSampler(&samplerState->sampler_, start++); + } +} + +void GX2DrawContext::Clear(int mask, uint32_t colorval, float depthVal, int stencilVal) { + float f[4]; + Uint8x4ToFloat4(f, colorval); + + // GX2DrawDone(); + int flags = (mask >> 1) & 0x3; + + if (flags && (mask & FBChannel::FB_COLOR_BIT)) { + GX2ClearBuffersEx(current_color_buffer_, current_depth_buffer_, f[0], f[1], f[2], f[3], depthVal, stencilVal, (GX2ClearFlags)flags); + } else if (mask & FBChannel::FB_COLOR_BIT) { + GX2ClearColor(current_color_buffer_, f[0], f[1], f[2], f[3]); + } else if (flags) { + GX2ClearDepthStencilEx(current_depth_buffer_, depthVal, stencilVal, (GX2ClearFlags)flags); + } + + GX2SetContextState(context_state_); + GX2SetShaderMode(GX2_SHADER_MODE_UNIFORM_BLOCK); +} + +void GX2DrawContext::BeginFrame() {} + +void GX2DrawContext::CopyFramebufferImage(Framebuffer *srcfb, int level, int x, int y, int z, Framebuffer *dstfb, int dstLevel, int dstX, int dstY, int dstZ, int width, int height, int depth, int channelBit, const char *tag) { + _assert_(level == 0 && dstLevel == 0 && z == 0 && dstZ == 0 && depth == 1); + GX2Rect srcRegion = { x, y, x + width, y + height }; + GX2Point dstCoords = { dstX, dstY }; + GX2Surface *srcSurface, *dstSurface; + if (channelBit == Draw::FB_COLOR_BIT) { + srcSurface = &((GX2Framebuffer *)srcfb)->colorBuffer.surface; + dstSurface = &((GX2Framebuffer *)dstfb)->colorBuffer.surface; + } else { + srcSurface = &((GX2Framebuffer *)srcfb)->depthBuffer.surface; + dstSurface = &((GX2Framebuffer *)dstfb)->depthBuffer.surface; + } + GX2CopySurfaceEx(srcSurface, level, z, dstSurface, dstLevel, dstZ, 1, &srcRegion, &dstCoords); + GX2SetContextState(context_state_); + GX2SetShaderMode(GX2_SHADER_MODE_UNIFORM_BLOCK); +} + +bool GX2DrawContext::BlitFramebuffer(Framebuffer *srcfb, int srcX1, int srcY1, int srcX2, int srcY2, Framebuffer *dstfb, int dstX1, int dstY1, int dstX2, int dstY2, int channelBits, FBBlitFilter filter, const char *tag) { + // TODO + 
DEBUG_LINE(); +// Crash(); + return false; +} + +bool GX2DrawContext::CopyFramebufferToMemorySync(Framebuffer *src, int channelBits, int bx, int by, int bw, int bh, Draw::DataFormat format, void *pixels, int pixelStride, const char *tag) { + _assert_(channelBits == FB_COLOR_BIT); + PROFILE_THIS_SCOPE("fbcpy_sync"); + GX2Framebuffer *fb = (GX2Framebuffer *)src; + GX2DrawDone(); + + GX2Surface *surface; + if (channelBits == FB_COLOR_BIT) { + surface = fb ? &fb->colorBuffer.surface : &color_buffer_->surface; + GX2Invalidate(GX2_INVALIDATE_MODE_COLOR_BUFFER, surface->image, surface->imageSize); + _assert_(surface->format == GX2_SURFACE_FORMAT_UNORM_R8_G8_B8_A8); + } else { + surface = fb ? &fb->depthBuffer.surface : &depth_buffer_->surface; + GX2Invalidate(GX2_INVALIDATE_MODE_DEPTH_BUFFER, surface->image, surface->imageSize); + } + + if (bx >= surface->width || by >= surface->height) + return true; + + // TODO: Figure out where the badness really comes from. + if (bx + bw > surface->width) { + bw = surface->width - bx; + } + + if (by + bh > surface->height) { + bh = surface->height - by; + } + + switch (channelBits) { + case FB_COLOR_BIT: { + // Pixel size always 4 here because we always request RGBA8888. 
+ const u32 *src = nullptr; + u32 handle; + GX2AllocateTilingApertureEx(surface, 0, 0, GX2_ENDIAN_SWAP_NONE, &handle, (void **)&src); + src += by * surface->pitch + bx; + ConvertFromRGBA8888((u8 *)pixels, (u8 *)src, pixelStride, surface->pitch, bw, bh, format); + GX2FreeTilingAperture(handle); + break; + } + case FB_DEPTH_BIT: + Crash(); // TODO + for (int y = by; y < by + bh; y++) { + float *dest = (float *)((u8 *)pixels + y * pixelStride * sizeof(float)); + const u32 *src = (u32 *)surface->image + by * surface->pitch + bx; + for (int x = 0; x < bw; x++) { + dest[x] = (src[x] & 0xFFFFFF) / (256.f * 256.f * 256.f); + } + } + break; + case FB_STENCIL_BIT: + Crash(); // TODO + for (int y = by; y < by + bh; y++) { + u8 *destStencil = (u8 *)pixels + y * pixelStride; + const u32 *src = (u32 *)surface->image + by * surface->pitch + bx; + for (int x = 0; x < bw; x++) { + destStencil[x] = src[x] >> 24; + } + } + break; + } + + return true; +} + +void GX2DrawContext::BindFramebufferAsRenderTarget(Framebuffer *fbo_, const RenderPassInfo &rp, const char *tag) { + GX2Framebuffer *fbo = (GX2Framebuffer *)fbo_; + + // GX2DrawDone(); + if (fbo) { + current_color_buffer_ = &fbo->colorBuffer; + current_depth_buffer_ = &fbo->depthBuffer; + } else { + current_color_buffer_ = color_buffer_; + current_depth_buffer_ = depth_buffer_; + } + + GX2SetColorBuffer(current_color_buffer_, GX2_RENDER_TARGET_0); + GX2SetDepthBuffer(current_depth_buffer_); + GX2SetScissor(0, 0, current_color_buffer_->surface.width, current_color_buffer_->surface.height); + float f[4]; + Uint8x4ToFloat4(f, rp.clearColor); + int flags = 0; + if (rp.depth == RPAction::CLEAR) + flags |= (int)GX2_CLEAR_FLAGS_DEPTH; + if (rp.stencil == RPAction::CLEAR) + flags |= (int)GX2_CLEAR_FLAGS_STENCIL; + + if ((rp.color == RPAction::CLEAR) && flags) { + GX2ClearBuffersEx(current_color_buffer_, current_depth_buffer_, f[0], f[1], f[2], f[3], rp.clearDepth, rp.clearStencil, (GX2ClearFlags)flags); + } else if (rp.color == 
RPAction::CLEAR) { + GX2ClearColor(current_color_buffer_, f[0], f[1], f[2], f[3]); + } else if (flags) { + GX2ClearDepthStencilEx(current_depth_buffer_, rp.clearDepth, rp.clearStencil, (GX2ClearFlags)flags); + } + GX2SetContextState(context_state_); + GX2SetShaderMode(GX2_SHADER_MODE_UNIFORM_BLOCK); +} + +void GX2DrawContext::BindFramebufferAsTexture(Framebuffer *fbo_, int binding, FBChannel channelBit, int attachment) { + GX2Framebuffer *fbo = (GX2Framebuffer *)fbo_; + _assert_(channelBit == FB_COLOR_BIT); + + // GX2DrawDone(); + if (channelBit == FB_COLOR_BIT) { + GX2SetPixelTexture(&fbo->colorTexture, binding); + } +} + +uintptr_t GX2DrawContext::GetFramebufferAPITexture(Framebuffer *fbo_, int channelBit, int attachment) { + GX2Framebuffer *fbo = (GX2Framebuffer *)fbo_; + _assert_(channelBit == FB_COLOR_BIT); + + // GX2DrawDone(); + if (channelBit == FB_COLOR_BIT) { + return (uintptr_t)&fbo->colorTexture; + } + return 0; +} + +void GX2DrawContext::GetFramebufferDimensions(Framebuffer *fbo_, int *w, int *h) { + GX2Framebuffer *fbo = (GX2Framebuffer *)fbo_; + if (fbo) { + *w = fbo->colorBuffer.surface.width; + *h = fbo->colorBuffer.surface.height; + } else { + *w = color_buffer_->surface.width; + *h = color_buffer_->surface.height; + } +} + + +DrawContext *T3DCreateGX2Context(GX2ContextState *context_state, GX2ColorBuffer *color_buffer, GX2DepthBuffer *depth_buffer) { return new GX2DrawContext(context_state, color_buffer, depth_buffer); } + +} // namespace Draw diff --git a/ext/native/thread/threadutil.cpp b/ext/native/thread/threadutil.cpp index a47286b96f15..b6c4aea55929 100644 --- a/ext/native/thread/threadutil.cpp +++ b/ext/native/thread/threadutil.cpp @@ -19,6 +19,10 @@ #include #endif +#ifdef __wiiu__ +#include +#endif + #ifdef TLS_SUPPORTED static __THREAD const char *curThreadName; #endif @@ -105,6 +109,8 @@ void setCurrentThreadName(const char* threadName) { pthread_setname_np(threadName); // #else // pthread_setname_np(threadName); +#elif 
defined(__wiiu__) + OSSetThreadName(OSGetCurrentThread(), threadName); #endif // Do nothing @@ -116,7 +122,10 @@ void setCurrentThreadName(const char* threadName) { } void AssertCurrentThreadName(const char *threadName) { -#ifdef TLS_SUPPORTED +#if defined(TLS_SUPPORTED) || defined(__wiiu__) +#ifdef __wiiu__ + const char *curThreadName = OSGetThreadName(OSGetCurrentThread()); +#endif if (strcmp(curThreadName, threadName) != 0) { ERROR_LOG(SYSTEM, "Thread name assert failed: Expected %s, was %s", threadName, curThreadName); } diff --git a/ext/native/util/random/rng.h b/ext/native/util/random/rng.h index 0a5960fa7028..b3dc89759389 100644 --- a/ext/native/util/random/rng.h +++ b/ext/native/util/random/rng.h @@ -1,6 +1,7 @@ #pragma once #include "base/basictypes.h" +#include "Common/Swap.h" // George Marsaglia-style random number generator. class GMRng { @@ -57,8 +58,8 @@ class MersenneTwister { MT_SIZE = 624, }; - uint32_t index_; - uint32_t mt_[MT_SIZE]; + u32_le index_; + u32_le mt_[MT_SIZE]; void gen() { for(uint32_t i = 0; i < MT_SIZE; i++){ diff --git a/ext/wiiu b/ext/wiiu new file mode 160000 index 000000000000..1f2d8bc2f8ff --- /dev/null +++ b/ext/wiiu @@ -0,0 +1 @@ +Subproject commit 1f2d8bc2f8fffca6336bd64870a9f42421881284 diff --git a/ffmpeg b/ffmpeg index 55147e5f33f5..72111e54ad73 160000 --- a/ffmpeg +++ b/ffmpeg @@ -1 +1 @@ -Subproject commit 55147e5f33f5ae4904f75ec082af809267122b94 +Subproject commit 72111e54ad73f877b1d5a62b6d970a0fe212454a diff --git a/ppsspp_config.h b/ppsspp_config.h index 655e744f144b..423ed928211b 100644 --- a/ppsspp_config.h +++ b/ppsspp_config.h @@ -72,6 +72,10 @@ #endif #endif +#if defined(__PPC__) + // we can't use PPC here since it could be pre-defined by the compiler + #define PPSSPP_ARCH_POWERPC 1 +#endif // PLATFORM defines #if defined(_WIN32) @@ -111,12 +115,17 @@ #elif defined(__ANDROID__) #define PPSSPP_PLATFORM_ANDROID 1 #define PPSSPP_PLATFORM_LINUX 1 +#elif defined(__wiiu__) + #define PPSSPP_ARCH_32BIT 1 + #define 
PPSSPP_ARCH_PPC750 1 + #define PPSSPP_PLATFORM_WIIU 1 + #define PPSSPP_API_GX2 1 #elif defined(__linux__) #define PPSSPP_PLATFORM_LINUX 1 #endif -// Windows ARM/ARM64, and Windows UWP (all), are the only platform that don't do GL at all (until Apple finally removes it) -#if !PPSSPP_PLATFORM(WINDOWS) || ((!PPSSPP_ARCH(ARM) && !PPSSPP_ARCH(ARM64)) && !PPSSPP_PLATFORM(UWP)) +// Windows ARM/ARM64, Windows UWP (all), and WiiU are the only platforms that don't do GL at all (until Apple finally removes it) +#if (!PPSSPP_PLATFORM(WINDOWS) || ((!PPSSPP_ARCH(ARM) && !PPSSPP_ARCH(ARM64)) && !PPSSPP_PLATFORM(UWP))) && !PPSSPP_PLATFORM(WIIU) #define PPSSPP_API_ANY_GL 1 #endif diff --git a/unittest/JitHarness.cpp b/unittest/JitHarness.cpp index e2cecf0f88da..5d33fc030e14 100644 --- a/unittest/JitHarness.cpp +++ b/unittest/JitHarness.cpp @@ -177,6 +177,8 @@ bool TestJit() { std::vector lines = DisassembleArm2(block->normalEntry, block->codeSize); #elif PPSSPP_ARCH(ARM64) std::vector lines = DisassembleArm64(block->normalEntry, block->codeSize); +#elif defined(__wiiu__) + std::vector lines = DisassemblePPC(block->normalEntry, block->codeSize); #else std::vector lines = DisassembleX86(block->normalEntry, block->codeSize); #endif