Skip to content

Commit

Permalink
Merge pull request #17810 from hrydgard/bbox-cache-planes
Browse files Browse the repository at this point in the history
Cache computed planes used for BBOX culling
  • Loading branch information
hrydgard authored Jul 30, 2023
2 parents a28acf2 + f0fd9e8 commit 4c560e4
Show file tree
Hide file tree
Showing 10 changed files with 126 additions and 86 deletions.
2 changes: 1 addition & 1 deletion Core/HLE/sceDisplay.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -594,7 +594,7 @@ void __DisplayFlip(int cyclesLate) {
#ifndef _DEBUG
auto err = GetI18NCategory(I18NCat::ERRORS);
if (g_Config.bSoftwareRendering) {
g_OSD.Show(OSDType::MESSAGE_INFO, err->T("Running slow: Try turning off Software Rendering"));
g_OSD.Show(OSDType::MESSAGE_INFO, err->T("Running slow: Try turning off Software Rendering"), 5.0f);
} else {
g_OSD.Show(OSDType::MESSAGE_INFO, err->T("Running slow: try frameskip, sound is choppy when slow"));
}
Expand Down
125 changes: 64 additions & 61 deletions GPU/Common/DrawEngineCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@
#include <cfloat>

#include "Common/Data/Convert/ColorConv.h"
#include "Common/Math/lin/matrix4x4.h"
#include "Common/Profiler/Profiler.h"
#include "Common/LogReporting.h"
#include "Common/Math/lin/matrix4x4.h"
#include "Core/Config.h"
#include "GPU/Common/DrawEngineCommon.h"
#include "GPU/Common/SplineCommon.h"
Expand Down Expand Up @@ -136,21 +136,6 @@ std::string DrawEngineCommon::DebugGetVertexLoaderString(std::string id, DebugSh
return dec ? dec->GetString(stringType) : "N/A";
}

// A plane stored as four coefficients (x, y, z, w).
struct Plane {
	float x, y, z, w;

	// Overwrites all four coefficients at once.
	void Set(float px, float py, float pz, float pw) {
		x = px;
		y = py;
		z = pz;
		w = pw;
	}

	// Evaluates x*f[0] + y*f[1] + z*f[2] + w for the 3-component point f.
	// The sign of the result tells which side of the plane the point lies on.
	float Test(const float f[3]) const {
		const float dot = x * f[0] + y * f[1] + z * f[2];
		return dot + w;
	}
};

static void PlanesFromMatrix(const float mtx[16], Plane planes[6]) {
planes[0].Set(mtx[3]-mtx[0], mtx[7]-mtx[4], mtx[11]-mtx[8], mtx[15]-mtx[12]); // Right
planes[1].Set(mtx[3]+mtx[0], mtx[7]+mtx[4], mtx[11]+mtx[8], mtx[15]+mtx[12]); // Left
planes[2].Set(mtx[3]+mtx[1], mtx[7]+mtx[5], mtx[11]+mtx[9], mtx[15]+mtx[13]); // Bottom
planes[3].Set(mtx[3]-mtx[1], mtx[7]-mtx[5], mtx[11]-mtx[9], mtx[15]-mtx[13]); // Top
planes[4].Set(mtx[3]+mtx[2], mtx[7]+mtx[6], mtx[11]+mtx[10], mtx[15]+mtx[14]); // Near
planes[5].Set(mtx[3]-mtx[2], mtx[7]-mtx[6], mtx[11]-mtx[10], mtx[15]-mtx[14]); // Far
}

static Vec3f ClipToScreen(const Vec4f& coords) {
float xScale = gstate.getViewportXScale();
float xCenter = gstate.getViewportXCenter();
Expand Down Expand Up @@ -250,6 +235,52 @@ void DrawEngineCommon::DispatchSubmitImm(GEPrimitiveType prim, TransformedVertex
}
}

// Gated by DIRTY_CULL_PLANES
void DrawEngineCommon::UpdatePlanes() {
float world[16];
float view[16];
float worldview[16];
float worldviewproj[16];
ConvertMatrix4x3To4x4(world, gstate.worldMatrix);
ConvertMatrix4x3To4x4(view, gstate.viewMatrix);
// TODO: Create a Matrix4x3ByMatrix4x3, and Matrix4x4ByMatrix4x3?
Matrix4ByMatrix4(worldview, world, view);
Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix);

// Next, we need to apply viewport, scissor, region, and even offset - but only for X/Y.
// Note that the PSP does not clip against the viewport.
const Vec2f baseOffset = Vec2f(gstate.getOffsetX(), gstate.getOffsetY());
// Region1 (rate) is used as an X1/Y1 here, matching PSP behavior.
minOffset_ = baseOffset + Vec2f(std::max(gstate.getRegionRateX() - 0x100, gstate.getScissorX1()), std::max(gstate.getRegionRateY() - 0x100, gstate.getScissorY1())) - Vec2f(1.0f, 1.0f);
maxOffset_ = baseOffset + Vec2f(std::min(gstate.getRegionX2(), gstate.getScissorX2()), std::min(gstate.getRegionY2(), gstate.getScissorY2())) + Vec2f(1.0f, 1.0f);

// Now let's apply the viewport to our scissor/region + offset range.
Vec2f inverseViewportScale = Vec2f(1.0f / gstate.getViewportXScale(), 1.0f / gstate.getViewportYScale());
Vec2f minViewport = (minOffset_ - Vec2f(gstate.getViewportXCenter(), gstate.getViewportYCenter())) * inverseViewportScale;
Vec2f maxViewport = (maxOffset_ - Vec2f(gstate.getViewportXCenter(), gstate.getViewportYCenter())) * inverseViewportScale;

Lin::Matrix4x4 applyViewport;
applyViewport.empty();
// Scale to the viewport's size.
applyViewport.xx = 2.0f / (maxViewport.x - minViewport.x);
applyViewport.yy = 2.0f / (maxViewport.y - minViewport.y);
applyViewport.zz = 1.0f;
applyViewport.ww = 1.0f;
// And offset to the viewport's centers.
applyViewport.wx = -(maxViewport.x + minViewport.x) / (maxViewport.x - minViewport.x);
applyViewport.wy = -(maxViewport.y + minViewport.y) / (maxViewport.y - minViewport.y);

float mtx[16];
Matrix4ByMatrix4(mtx, worldviewproj, applyViewport.m);

planes_[0].Set(mtx[3] - mtx[0], mtx[7] - mtx[4], mtx[11] - mtx[8], mtx[15] - mtx[12]); // Right
planes_[1].Set(mtx[3] + mtx[0], mtx[7] + mtx[4], mtx[11] + mtx[8], mtx[15] + mtx[12]); // Left
planes_[2].Set(mtx[3] + mtx[1], mtx[7] + mtx[5], mtx[11] + mtx[9], mtx[15] + mtx[13]); // Bottom
planes_[3].Set(mtx[3] - mtx[1], mtx[7] - mtx[5], mtx[11] - mtx[9], mtx[15] - mtx[13]); // Top
planes_[4].Set(mtx[3] + mtx[2], mtx[7] + mtx[6], mtx[11] + mtx[10], mtx[15] + mtx[14]); // Near
planes_[5].Set(mtx[3] - mtx[2], mtx[7] - mtx[6], mtx[11] - mtx[10], mtx[15] - mtx[14]); // Far
}

// This code has plenty of potential for optimization.
//
// It does the simplest and safest test possible: If all points of a bbox are outside a single of
Expand All @@ -273,7 +304,7 @@ bool DrawEngineCommon::TestBoundingBox(const void *control_points, const void *i
verts[i] = vtx[i] * (1.0f / 128.0f);
}
} else if ((vertType & 0xFFFFFF) == GE_VTYPE_POS_16BIT && !inds) {
const s16 *vtx = (const s16*)control_points;
const s16 *vtx = (const s16 *)control_points;
for (int i = 0; i < vertexCount * 3; i++) {
verts[i] = vtx[i] * (1.0f / 32768.0f);
}
Expand Down Expand Up @@ -302,70 +333,42 @@ bool DrawEngineCommon::TestBoundingBox(const void *control_points, const void *i
}
}

Plane planes[6];

float world[16];
float view[16];
float worldview[16];
float worldviewproj[16];
ConvertMatrix4x3To4x4(world, gstate.worldMatrix);
ConvertMatrix4x3To4x4(view, gstate.viewMatrix);
// TODO: Create a Matrix4x3ByMatrix4x3, and Matrix4x4ByMatrix4x3?
Matrix4ByMatrix4(worldview, world, view);
Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix);

// Next, we need to apply viewport, scissor, region, and even offset - but only for X/Y.
// Note that the PSP does not clip against the viewport.
const Vec2f baseOffset = Vec2f(gstate.getOffsetX(), gstate.getOffsetY());
// Region1 (rate) is used as an X1/Y1 here, matching PSP behavior.
Vec2f minOffset = baseOffset + Vec2f(std::max(gstate.getRegionRateX() - 0x100, gstate.getScissorX1()), std::max(gstate.getRegionRateY() - 0x100, gstate.getScissorY1())) - Vec2f(1.0f, 1.0f);
Vec2f maxOffset = baseOffset + Vec2f(std::min(gstate.getRegionX2(), gstate.getScissorX2()), std::min(gstate.getRegionY2(), gstate.getScissorY2())) + Vec2f(1.0f, 1.0f);

// Now let's apply the viewport to our scissor/region + offset range.
Vec2f inverseViewportScale = Vec2f(1.0f / gstate.getViewportXScale(), 1.0f / gstate.getViewportYScale());
Vec2f minViewport = (minOffset - Vec2f(gstate.getViewportXCenter(), gstate.getViewportYCenter())) * inverseViewportScale;
Vec2f maxViewport = (maxOffset - Vec2f(gstate.getViewportXCenter(), gstate.getViewportYCenter())) * inverseViewportScale;

Lin::Matrix4x4 applyViewport;
applyViewport.empty();
// Scale to the viewport's size.
applyViewport.xx = 2.0f / (maxViewport.x - minViewport.x);
applyViewport.yy = 2.0f / (maxViewport.y - minViewport.y);
applyViewport.zz = 1.0f;
applyViewport.ww = 1.0f;
// And offset to the viewport's centers.
applyViewport.wx = -(maxViewport.x + minViewport.x) / (maxViewport.x - minViewport.x);
applyViewport.wy = -(maxViewport.y + minViewport.y) / (maxViewport.y - minViewport.y);

float screenBounds[16];
Matrix4ByMatrix4(screenBounds, worldviewproj, applyViewport.m);
// Due to world matrix updates per "thing", this isn't quite as effective as it could be if we did world transform
// in here as well. Though, it still does cut down on a lot of updates in Tekken 6.
if (gstate_c.IsDirty(DIRTY_CULL_PLANES)) {
UpdatePlanes();
gpuStats.numPlaneUpdates++;
gstate_c.Clean(DIRTY_CULL_PLANES);
}

PlanesFromMatrix(screenBounds, planes);
// Note: near/far are not checked without clamp/clip enabled, so we skip those planes.
int totalPlanes = gstate.isDepthClampEnabled() ? 6 : 4;
for (int plane = 0; plane < totalPlanes; plane++) {
int inside = 0;
int out = 0;
for (int i = 0; i < vertexCount; i++) {
// Here we can test against the frustum planes!
float value = planes[plane].Test(verts + i * 3);
// Test against the frustum planes, and count.
// TODO: We should test 4 vertices at a time using SIMD.
// I guess could also test one vertex against 4 planes at a time, though a lot of waste at the common case of 6.
float value = planes_[plane].Test(verts + i * 3);
if (value <= -FLT_EPSILON)
out++;
else
inside++;
}

// No vertices inside this one plane? Don't need to draw.
if (inside == 0) {
// All out - but check for X and Y if the offset was near the cullbox edge.
bool outsideEdge = false;
if (plane == 1)
outsideEdge = minOffset.x < 1.0f;
outsideEdge = minOffset_.x < 1.0f;
if (plane == 2)
outsideEdge = minOffset.y < 1.0f;
outsideEdge = minOffset_.y < 1.0f;
else if (plane == 0)
outsideEdge = maxOffset.x >= 4096.0f;
outsideEdge = maxOffset_.x >= 4096.0f;
else if (plane == 3)
outsideEdge = maxOffset.y >= 4096.0f;
outsideEdge = maxOffset_.y >= 4096.0f;

// Only consider this outside if offset + scissor/region is fully inside the cullbox.
if (!outsideEdge)
Expand Down
14 changes: 14 additions & 0 deletions GPU/Common/DrawEngineCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "Common/CommonTypes.h"
#include "Common/Data/Collections/Hashmaps.h"

#include "GPU/Math3D.h"
#include "GPU/GPUState.h"
#include "GPU/Common/GPUStateUtils.h"
#include "GPU/Common/GPUDebugInterface.h"
Expand Down Expand Up @@ -68,6 +69,13 @@ class TessellationDataTransfer {
virtual void SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) = 0;
};

// Culling plane, stored as four coefficients (x, y, z, w).
struct Plane {
	float x, y, z, w;

	// Overwrites all four coefficients at once.
	void Set(float px, float py, float pz, float pw) {
		x = px;
		y = py;
		z = pz;
		w = pw;
	}

	// Evaluates x*f[0] + y*f[1] + z*f[2] + w for the 3-component point f.
	// The sign of the result tells which side of the plane the point lies on.
	float Test(const float f[3]) const {
		const float dot = x * f[0] + y * f[1] + z * f[2];
		return dot + w;
	}
};

class DrawEngineCommon {
public:
DrawEngineCommon();
Expand Down Expand Up @@ -131,6 +139,7 @@ class DrawEngineCommon {

protected:
virtual bool UpdateUseHWTessellation(bool enabled) const { return enabled; }
void UpdatePlanes();

int ComputeNumVertsToDecode() const;
void DecodeVerts(u8 *dest);
Expand Down Expand Up @@ -236,4 +245,9 @@ class DrawEngineCommon {

// Hardware tessellation
TessellationDataTransfer *tessDataTransfer;

// Culling
Plane planes_[6];
Vec2f minOffset_;
Vec2f maxOffset_;
};
3 changes: 2 additions & 1 deletion GPU/Common/ShaderCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,14 +93,15 @@ enum : uint64_t {
DIRTY_LIGHT_CONTROL = 1ULL << 38,
DIRTY_TEX_ALPHA_MUL = 1ULL << 39,

// Bits 40-43 are free for new uniforms. Then we're really out and need to start merging.
// Bits 40-42 are free for new uniforms. Then we're really out and need to start merging.
// Don't forget to update DIRTY_ALL_UNIFORMS when you start using them.

DIRTY_BONE_UNIFORMS = 0xFF000000ULL,

DIRTY_ALL_UNIFORMS = 0x0FFFFFFFFFFULL,

// Other dirty elements that aren't uniforms
DIRTY_CULL_PLANES = 1ULL << 43,
DIRTY_FRAMEBUF = 1ULL << 44,
DIRTY_TEXTURE_IMAGE = 1ULL << 45, // Means that the definition of the texture image has changed (address, stride etc), and we need to look up again.
DIRTY_TEXTURE_PARAMS = 1ULL << 46,
Expand Down
9 changes: 5 additions & 4 deletions GPU/GPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ inline unsigned int toFloat24(float f) {
return i >> 8;
}

// The ToString function lives in GPUCommonHW.cpp.
struct GPUStatistics {
void Reset() {
ResetFrame();
Expand All @@ -84,10 +85,10 @@ struct GPUStatistics {
numTextureInvalidations = 0;
numTextureInvalidationsByFramebuffer = 0;
numTexturesHashed = 0;
numTextureSwitches = 0;
numTextureDataBytesHashed = 0;
numShaderSwitches = 0;
numFlushes = 0;
numBBOXJumps = 0;
numPlaneUpdates = 0;
numTexturesDecoded = 0;
numFramebufferEvaluations = 0;
numBlockingReadbacks = 0;
Expand All @@ -114,6 +115,8 @@ struct GPUStatistics {
int numListSyncs;
int numCachedDrawCalls;
int numFlushes;
int numBBOXJumps;
int numPlaneUpdates;
int numVertsSubmitted;
int numCachedVertsDrawn;
int numUncachedVertsDrawn;
Expand All @@ -122,8 +125,6 @@ struct GPUStatistics {
int numTextureInvalidationsByFramebuffer;
int numTexturesHashed;
int numTextureDataBytesHashed;
int numTextureSwitches;
int numShaderSwitches;
int numTexturesDecoded;
int numFramebufferEvaluations;
int numBlockingReadbacks;
Expand Down
1 change: 1 addition & 0 deletions GPU/GPUCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -931,6 +931,7 @@ void GPUCommon::Execute_BJump(u32 op, u32 diff) {
if (!currentList->bboxResult) {
// bounding box jump.
const u32 target = gstate_c.getRelativeAddress(op & 0x00FFFFFC);
gpuStats.numBBOXJumps++;
if (Memory::IsValidAddress(target)) {
UpdatePC(currentList->pc, target - 4);
currentList->pc = target - 4; // pc will be increased after we return, counteract that
Expand Down
Loading

0 comments on commit 4c560e4

Please sign in to comment.