Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor VRAM download, optimize rendered CLUTs #8389

Merged
merged 10 commits into from
Jan 5, 2016
97 changes: 90 additions & 7 deletions GPU/Common/FramebufferCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,8 @@ FramebufferManagerCommon::FramebufferManagerCommon() :
usePostShader_(false),
postShaderAtOutputResolution_(false),
postShaderIsUpscalingFilter_(false),
hackForce04154000Download_(false) {
hackForce04154000Download_(false),
gameUsesSequentialCopies_(false) {
UpdateSize();
}

Expand Down Expand Up @@ -165,21 +166,18 @@ void FramebufferManagerCommon::SetDisplayFramebuffer(u32 framebuf, u32 stride, G
}

VirtualFramebuffer *FramebufferManagerCommon::GetVFBAt(u32 addr) {
VirtualFramebuffer *match = NULL;
VirtualFramebuffer *match = nullptr;
for (size_t i = 0; i < vfbs_.size(); ++i) {
VirtualFramebuffer *v = vfbs_[i];
if (MaskedEqual(v->fb_address, addr)) {
// Could check w too but whatever
if (match == NULL || match->last_frame_render < v->last_frame_render) {
if (match == nullptr || match->last_frame_render < v->last_frame_render) {
match = v;
}
}
}
if (match != NULL) {
return match;
}

return 0;
return match;
}

bool FramebufferManagerCommon::MaskedEqual(u32 addr1, u32 addr2) {
Expand Down Expand Up @@ -730,6 +728,91 @@ void FramebufferManagerCommon::FindTransferFramebuffers(VirtualFramebuffer *&dst
}
}

VirtualFramebuffer *FramebufferManagerCommon::FindDownloadTempBuffer(VirtualFramebuffer *vfb) {
// For now we'll keep these on the same struct as the ones that can get displayed
// (and blatantly copy work already done above while at it).
VirtualFramebuffer *nvfb = 0;

// We maintain a separate vector of framebuffer objects for blitting.
for (size_t i = 0; i < bvfbs_.size(); ++i) {
VirtualFramebuffer *v = bvfbs_[i];
if (v->fb_address == vfb->fb_address && v->format == vfb->format) {
if (v->bufferWidth == vfb->bufferWidth && v->bufferHeight == vfb->bufferHeight) {
nvfb = v;
v->fb_stride = vfb->fb_stride;
v->width = vfb->width;
v->height = vfb->height;
break;
}
}
}

// Create a new fbo if none was found for the size
if (!nvfb) {
nvfb = new VirtualFramebuffer();
nvfb->fbo = nullptr;
nvfb->fb_address = vfb->fb_address;
nvfb->fb_stride = vfb->fb_stride;
nvfb->z_address = vfb->z_address;
nvfb->z_stride = vfb->z_stride;
nvfb->width = vfb->width;
nvfb->height = vfb->height;
nvfb->renderWidth = vfb->bufferWidth;
nvfb->renderHeight = vfb->bufferHeight;
nvfb->bufferWidth = vfb->bufferWidth;
nvfb->bufferHeight = vfb->bufferHeight;
nvfb->format = vfb->format;
nvfb->drawnWidth = vfb->drawnWidth;
nvfb->drawnHeight = vfb->drawnHeight;
nvfb->drawnFormat = vfb->format;
nvfb->colorDepth = vfb->colorDepth;

if (!CreateDownloadTempBuffer(nvfb)) {
delete nvfb;
return nullptr;
}

bvfbs_.push_back(nvfb);
} else {
UpdateDownloadTempBuffer(nvfb);
}

nvfb->usageFlags |= FB_USAGE_RENDERTARGET;
nvfb->last_frame_render = gpuStats.numFlips;
nvfb->dirtyAfterDisplay = true;

return nvfb;
}

void FramebufferManagerCommon::OptimizeDownloadRange(VirtualFramebuffer * vfb, int & x, int & y, int & w, int & h) {
if (gameUsesSequentialCopies_) {
// Ignore the x/y/etc., read the entire thing.
x = 0;
y = 0;
w = vfb->width;
h = vfb->height;
}
if (x == 0 && y == 0 && w == vfb->width && h == vfb->height) {
// Mark it as fully downloaded until next render to it.
vfb->memoryUpdated = true;
} else {
// Let's try to set the flag eventually, if the game copies a lot.
// Some games copy subranges very frequently.
const static int FREQUENT_SEQUENTIAL_COPIES = 3;
static int frameLastCopy = 0;
static u32 bufferLastCopy = 0;
static int copiesThisFrame = 0;
if (frameLastCopy != gpuStats.numFlips || bufferLastCopy != vfb->fb_address) {
frameLastCopy = gpuStats.numFlips;
bufferLastCopy = vfb->fb_address;
copiesThisFrame = 0;
}
if (++copiesThisFrame > FREQUENT_SEQUENTIAL_COPIES) {
gameUsesSequentialCopies_ = true;
}
}
}

bool FramebufferManagerCommon::NotifyBlockTransferBefore(u32 dstBasePtr, int dstStride, int dstX, int dstY, u32 srcBasePtr, int srcStride, int srcX, int srcY, int width, int height, int bpp, u32 skipDrawReason) {
if (!useBufferedRendering_ || updateVRAM_) {
return false;
Expand Down
9 changes: 9 additions & 0 deletions GPU/Common/FramebufferCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ struct VirtualFramebuffer {
int last_frame_render;
int last_frame_displayed;
int last_frame_clut;
u32 clutUpdatedBytes;
bool memoryUpdated;
bool depthUpdated;

Expand Down Expand Up @@ -168,6 +169,7 @@ class FramebufferManagerCommon {
void NotifyBlockTransferAfter(u32 dstBasePtr, int dstStride, int dstX, int dstY, u32 srcBasePtr, int srcStride, int srcX, int srcY, int w, int h, int bpp, u32 skipDrawReason);

virtual void ReadFramebufferToMemory(VirtualFramebuffer *vfb, bool sync, int x, int y, int w, int h) = 0;
virtual void DownloadFramebufferForClut(u32 fb_address, u32 loadBytes) = 0;
virtual void MakePixelTexture(const u8 *srcPixels, GEBufferFormat srcPixelFormat, int srcStride, int width, int height) = 0;
virtual void DrawPixels(VirtualFramebuffer *vfb, int dstX, int dstY, const u8 *srcPixels, GEBufferFormat srcPixelFormat, int srcStride, int width, int height) = 0;
virtual void DrawFramebufferToOutput(const u8 *srcPixels, GEBufferFormat srcPixelFormat, int srcStride, bool applyPostShader) = 0;
Expand Down Expand Up @@ -244,11 +246,16 @@ class FramebufferManagerCommon {

bool ShouldDownloadFramebuffer(const VirtualFramebuffer *vfb) const;
void FindTransferFramebuffers(VirtualFramebuffer *&dstBuffer, VirtualFramebuffer *&srcBuffer, u32 dstBasePtr, int dstStride, int &dstX, int &dstY, u32 srcBasePtr, int srcStride, int &srcX, int &srcY, int &srcWidth, int &srcHeight, int &dstWidth, int &dstHeight, int bpp) const;
VirtualFramebuffer *FindDownloadTempBuffer(VirtualFramebuffer *vfb);
virtual bool CreateDownloadTempBuffer(VirtualFramebuffer *nvfb) = 0;
virtual void UpdateDownloadTempBuffer(VirtualFramebuffer *nvfb) = 0;
void OptimizeDownloadRange(VirtualFramebuffer *vfb, int &x, int &y, int &w, int &h);

void UpdateFramebufUsage(VirtualFramebuffer *vfb);

void SetColorUpdated(VirtualFramebuffer *dstBuffer, int skipDrawReason) {
dstBuffer->memoryUpdated = false;
dstBuffer->clutUpdatedBytes = 0;
dstBuffer->dirtyAfterDisplay = true;
dstBuffer->drawnWidth = dstBuffer->width;
dstBuffer->drawnHeight = dstBuffer->height;
Expand Down Expand Up @@ -278,9 +285,11 @@ class FramebufferManagerCommon {
bool postShaderIsUpscalingFilter_;

std::vector<VirtualFramebuffer *> vfbs_;
std::vector<VirtualFramebuffer *> bvfbs_; // blitting framebuffers (for download)
std::set<std::pair<u32, u32>> knownFramebufferRAMCopies_;

bool hackForce04154000Download_;
bool gameUsesSequentialCopies_;

// Sampled in BeginFrame for safety.
float renderWidth_;
Expand Down
55 changes: 35 additions & 20 deletions GPU/Common/TextureCacheCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -213,46 +213,61 @@ void TextureCacheCommon::LoadClut(u32 clutAddr, u32 loadBytes) {
if (Memory::IsVRAMAddress(clutAddr)) {
// Clear the uncached bit, etc. to match framebuffers.
const u32 clutFramebufAddr = clutAddr & 0x3FFFFFFF;
const u32 clutFramebufEnd = clutFramebufAddr + loadBytes;
static const u32 MAX_CLUT_OFFSET = 4096;

clutRenderOffset_ = MAX_CLUT_OFFSET;
for (size_t i = 0, n = fbCache_.size(); i < n; ++i) {
auto framebuffer = fbCache_[i];
if ((framebuffer->fb_address | 0x04000000) == clutFramebufAddr) {
const u32 fb_address = framebuffer->fb_address | 0x04000000;
const u32 bpp = framebuffer->drawnFormat == GE_FORMAT_8888 ? 4 : 2;
bool match = fb_address + framebuffer->fb_stride * bpp > clutFramebufAddr && fb_address < clutFramebufEnd;
u32 offset = clutFramebufAddr - fb_address;
if (match && offset < clutRenderOffset_) {
framebuffer->last_frame_clut = gpuStats.numFlips;
framebuffer->usageFlags |= FB_USAGE_CLUT;
clutRenderAddress_ = framebuffer->fb_address;
clutRenderOffset_ = offset;
if (offset == 0) {
break;
}
}
}
}

// It's possible for a game to (successfully) access outside valid memory.
u32 bytes = Memory::ValidSize(clutAddr, loadBytes);
if (clutRenderAddress_ != 0xFFFFFFFF && !g_Config.bDisableSlowFramebufEffects) {
gpu->PerformMemoryDownload(clutAddr, bytes);
}

#ifdef _M_SSE
int numBlocks = bytes / 16;
if (bytes == loadBytes) {
const __m128i *source = (const __m128i *)Memory::GetPointerUnchecked(clutAddr);
__m128i *dest = (__m128i *)clutBufRaw_;
for (int i = 0; i < numBlocks; i++, source += 2, dest += 2) {
__m128i data1 = _mm_loadu_si128(source);
__m128i data2 = _mm_loadu_si128(source + 1);
_mm_store_si128(dest, data1);
_mm_store_si128(dest + 1, data2);
DownloadFramebufferForClut(clutRenderAddress_, clutRenderOffset_ + bytes);
Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes);
if (bytes < loadBytes) {
memset((u8 *)clutBufRaw_ + bytes, 0x00, loadBytes - bytes);
}
} else {
#ifdef _M_SSE
int numBlocks = bytes / 16;
if (bytes == loadBytes) {
const __m128i *source = (const __m128i *)Memory::GetPointerUnchecked(clutAddr);
__m128i *dest = (__m128i *)clutBufRaw_;
for (int i = 0; i < numBlocks; i++, source += 2, dest += 2) {
__m128i data1 = _mm_loadu_si128(source);
__m128i data2 = _mm_loadu_si128(source + 1);
_mm_store_si128(dest, data1);
_mm_store_si128(dest + 1, data2);
}
} else {
Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes);
if (bytes < loadBytes) {
memset((u8 *)clutBufRaw_ + bytes, 0x00, loadBytes - bytes);
}
}
#else
Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes);
if (bytes < loadBytes) {
memset((u8 *)clutBufRaw_ + bytes, 0x00, loadBytes - bytes);
}
}
#else
Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes);
if (bytes < clutTotalBytes_) {
memset((u8 *)clutBufRaw_ + bytes, 0x00, clutTotalBytes_ - bytes);
}
#endif
}
} else {
memset(clutBufRaw_, 0x00, loadBytes);
}
Expand Down
3 changes: 3 additions & 0 deletions GPU/Common/TextureCacheCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ class TextureCacheCommon {
virtual bool AttachFramebuffer(TexCacheEntry *entry, u32 address, VirtualFramebuffer *framebuffer, u32 texaddrOffset = 0) = 0;
virtual void DetachFramebuffer(TexCacheEntry *entry, u32 address, VirtualFramebuffer *framebuffer) = 0;

virtual void DownloadFramebufferForClut(u32 clutAddr, u32 bytes) = 0;

TexCache cache;
std::vector<VirtualFramebuffer *> fbCache_;

Expand All @@ -155,6 +157,7 @@ class TextureCacheCommon {
u32 clutTotalBytes_;
u32 clutMaxBytes_;
u32 clutRenderAddress_;
u32 clutRenderOffset_;
int standardScaleFactor_;
};

Expand Down
Loading