fix VertexDecoder/SoftGpu on big-endian.

hrydgard · May 6, 2018 · e98e9f7
1 parent 0188d5c
commit e98e9f7
Show file tree

Hide file tree

Showing 7 changed files with 105 additions and 98 deletions.
diff --git a/Common/ColorConv.cpp b/Common/ColorConv.cpp
@@ -322,13 +322,12 @@ void ConvertRGBA565ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {
 	u32 i = 0;
 #endif
 
-	u8 *dst = (u8 *)dst32;
 	for (u32 x = i; x < numPixels; x++) {
 		u16 col = src[x];
-		dst[x * 4] = Convert5To8((col) & 0x1f);
-		dst[x * 4 + 1] = Convert6To8((col >> 5) & 0x3f);
-		dst[x * 4 + 2] = Convert5To8((col >> 11) & 0x1f);
-		dst[x * 4 + 3] = 255;
+		dst32[x] = Convert5To8((col) & 0x1f);
+		dst32[x] |= Convert6To8((col >> 5) & 0x3f) << 8;
+		dst32[x] |= Convert5To8((col >> 11) & 0x1f) << 16;
+		dst32[x] |= 255 << 24;
 	}
 }
 
@@ -376,13 +375,12 @@ void ConvertRGBA5551ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {
 	u32 i = 0;
 #endif
 
-	u8 *dst = (u8 *)dst32;
 	for (u32 x = i; x < numPixels; x++) {
 		u16 col = src[x];
-		dst[x * 4] = Convert5To8((col) & 0x1f);
-		dst[x * 4 + 1] = Convert5To8((col >> 5) & 0x1f);
-		dst[x * 4 + 2] = Convert5To8((col >> 10) & 0x1f);
-		dst[x * 4 + 3] = (col >> 15) ? 255 : 0;
+		dst32[x] = Convert5To8((col) & 0x1f);
+		dst32[x] |= Convert5To8((col >> 5) & 0x1f) << 8;
+		dst32[x] |= Convert5To8((col >> 10) & 0x1f) << 16;
+		dst32[x] |= (col >> 15) ? 255  << 24 : 0;
 	}
 }
 
@@ -425,51 +423,46 @@ void ConvertRGBA4444ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {
 	u32 i = 0;
 #endif
 
-	u8 *dst = (u8 *)dst32;
 	for (u32 x = i; x < numPixels; x++) {
 		u16 col = src[x];
-		dst[x * 4] = Convert4To8(col & 0xf);
-		dst[x * 4 + 1] = Convert4To8((col >> 4) & 0xf);
-		dst[x * 4 + 2] = Convert4To8((col >> 8) & 0xf);
-		dst[x * 4 + 3] = Convert4To8(col >> 12);
+		dst32[x] = Convert4To8(col & 0xf);
+		dst32[x] |= Convert4To8((col >> 4) & 0xf) << 8;
+		dst32[x] |= Convert4To8((col >> 8) & 0xf) << 16;
+		dst32[x] |= Convert4To8(col >> 12) << 24;
 	}
 }
 
 void ConvertABGR565ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {
-	u8 *dst = (u8 *)dst32;
 	for (u32 x = 0; x < numPixels; x++) {
 		u16 col = src[x];
-		dst[x * 4] = Convert5To8((col >> 11) & 0x1f);
-		dst[x * 4 + 1] = Convert6To8((col >> 5) & 0x3f);
-		dst[x * 4 + 2] = Convert5To8((col) & 0x1f);
-		dst[x * 4 + 3] = 255;
+		dst32[x] = Convert5To8((col >> 11) & 0x1f);
+		dst32[x] |= Convert6To8((col >> 5) & 0x3f) << 8;
+		dst32[x] |= Convert5To8((col) & 0x1f) << 16;
+		dst32[x] |= 255 << 24;
 	}
 }
 
 void ConvertABGR1555ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {
-	u8 *dst = (u8 *)dst32;
 	for (u32 x = 0; x < numPixels; x++) {
 		u16 col = src[x];
-		dst[x * 4] = Convert5To8((col >> 11) & 0x1f);
-		dst[x * 4 + 1] = Convert5To8((col >> 6) & 0x1f);
-		dst[x * 4 + 2] = Convert5To8((col >> 1) & 0x1f);
-		dst[x * 4 + 3] = (col & 1) ? 255 : 0;
+		dst32[x] = Convert5To8((col >> 11) & 0x1f);
+		dst32[x] |= Convert5To8((col >> 6) & 0x1f) << 8;
+		dst32[x] |= Convert5To8((col >> 1) & 0x1f) << 16;
+		dst32[x] |= (col & 1) ? 255  << 24 : 0;
 	}
 }
 
 void ConvertABGR4444ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {
-	u8 *dst = (u8 *)dst32;
 	for (u32 x = 0; x < numPixels; x++) {
 		u16 col = src[x];
-		dst[x * 4] = Convert4To8(col >> 12);
-		dst[x * 4 + 1] = Convert4To8((col >> 8) & 0xf);
-		dst[x * 4 + 2] = Convert4To8((col >> 4) & 0xf);
-		dst[x * 4 + 3] = Convert4To8(col & 0xf);
+		dst32[x] = Convert4To8(col >> 12);
+		dst32[x] |= Convert4To8((col >> 8) & 0xf) << 8;
+		dst32[x] |= Convert4To8((col >> 4) & 0xf) << 16;
+		dst32[x] |= Convert4To8(col & 0xf) << 24;
 	}
 }
 
-void ConvertRGBA4444ToBGRA8888(u32 *dst32, const u16 *src, u32 numPixels) {
-	u8 *dst = (u8 *)dst32;
+void ConvertRGBA4444ToBGRA8888(u32 *dst, const u16 *src, u32 numPixels) {
 	for (u32 x = 0; x < numPixels; x++) {
 		u16 c = src[x];
 		u32 r = c & 0x000f;

diff --git a/GPU/Common/VertexDecoderCommon.cpp b/GPU/Common/VertexDecoderCommon.cpp
@@ -117,7 +117,7 @@ void GetIndexBounds(const void *inds, int count, u32 vertType, u16 *indexLowerBo
 				lowerBound = value;
 		}
 	} else if (idx == GE_VTYPE_IDX_16BIT) {
-		const u16 *ind16 = (const u16 *)inds;
+		const u16_le *ind16 = (const u16_le *)inds;
 		for (int i = 0; i < count; i++) {
 			u16 value = ind16[i];
 			if (value > upperBound)
@@ -127,7 +127,7 @@ void GetIndexBounds(const void *inds, int count, u32 vertType, u16 *indexLowerBo
 		}
 	} else if (idx == GE_VTYPE_IDX_32BIT) {
 		WARN_LOG_REPORT_ONCE(indexBounds32, G3D, "GetIndexBounds: Decoding 32-bit indexes");
-		const u32 *ind32 = (const u32 *)inds;
+		const u32_le *ind32 = (const u32_le *)inds;
 		for (int i = 0; i < count; i++) {
 			u16 value = (u16)ind32[i];
 			// These aren't documented and should be rare.  Let's bounds check each one.
@@ -496,41 +496,42 @@ void VertexDecoder::Step_ColorInvalid() const
 
 void VertexDecoder::Step_Color565() const
 {
-	u8 *c = decoded_ + decFmt.c0off;
+	u32 *c = (u32*)(decoded_ + decFmt.c0off);
 	u16 cdata = *(u16_le *)(ptr_ + coloff);
-	c[0] = Convert5To8(cdata & 0x1f);
-	c[1] = Convert6To8((cdata >> 5) & 0x3f);
-	c[2] = Convert5To8((cdata >> 11) & 0x1f);
-	c[3] = 255;
+	*c = Convert5To8(cdata & 0x1f);
+	*c |= Convert6To8((cdata >> 5) & 0x3f) << 8;
+	*c |= Convert5To8((cdata >> 11) & 0x1f) << 16;
+	*c |= 255 << 24;
 	// Always full alpha.
 }
 
 void VertexDecoder::Step_Color5551() const
 {
-	u8 *c = decoded_ + decFmt.c0off;
+	u32 *c = (u32*)(decoded_ + decFmt.c0off);
 	u16 cdata = *(u16_le *)(ptr_ + coloff);
 	gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && (cdata >> 15) != 0;
-	c[0] = Convert5To8(cdata & 0x1f);
-	c[1] = Convert5To8((cdata >> 5) & 0x1f);
-	c[2] = Convert5To8((cdata >> 10) & 0x1f);
-	c[3] = (cdata >> 15) ? 255 : 0;
+	*c = Convert5To8(cdata & 0x1f);
+	*c |= Convert5To8((cdata >> 5) & 0x1f) << 8;
+	*c |= Convert5To8((cdata >> 10) & 0x1f) << 16;
+	*c |= (cdata >> 15) ? 255 << 24 : 0;
 }
 
 void VertexDecoder::Step_Color4444() const
 {
-	u8 *c = decoded_ + decFmt.c0off;
+	u32 *c = (u32*)(decoded_ + decFmt.c0off);
 	u16 cdata = *(u16_le *)(ptr_ + coloff);
 	gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && (cdata >> 12) == 0xF;
+	*c = 0;
 	for (int j = 0; j < 4; j++)
-		c[j] = Convert4To8((cdata >> (j * 4)) & 0xF);
+		*c |= Convert4To8((cdata >> (j * 4)) & 0xF) << (j * 8);
 }
 
 void VertexDecoder::Step_Color8888() const
 {
-	u8 *c = decoded_ + decFmt.c0off;
-	const u8 *cdata = (const u8*)(ptr_ + coloff);
-	gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && cdata[3] == 255;
-	memcpy(c, cdata, sizeof(u8) * 4);
+	u32 *c = (u32*)(decoded_ + decFmt.c0off);
+	u32 cdata = *(u32_le*)(ptr_ + coloff);
+	gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && (cdata >> 24) == 0xFF;
+	*c = cdata;
 }
 
 void VertexDecoder::Step_Color565Morph() const
@@ -750,9 +751,10 @@ void VertexDecoder::Step_PosS16() const
 
 void VertexDecoder::Step_PosFloat() const
 {
-	u8 *v = (u8 *)(decoded_ + decFmt.posoff);
-	const u8 *fv = (const u8*)(ptr_ + posoff);
-	memcpy(v, fv, 12);
+	float *pos = (float *)(decoded_ + decFmt.posoff);
+	const float_le *fv = (const float_le *)(ptr_ + posoff);
+	for (int j = 0; j < 3; j++)
+		pos[j] = fv[j];
 }
 
 void VertexDecoder::Step_PosS8Skin() const
@@ -800,9 +802,11 @@ void VertexDecoder::Step_PosS16Through() const
 
 void VertexDecoder::Step_PosFloatThrough() const
 {
-	u8 *v = (u8 *)(decoded_ + decFmt.posoff);
-	const u8 *fv = (const u8 *)(ptr_ + posoff);
-	memcpy(v, fv, 12);
+	float *v = (float *)(decoded_ + decFmt.posoff);
+	const float_le *fv = (const float_le*)(ptr_ + posoff);
+	v[0] = fv[0];
+	v[1] = fv[1];
+	v[2] = fv[2];
 }
 
 void VertexDecoder::Step_PosS8Morph() const
@@ -1355,6 +1359,8 @@ std::string VertexDecoder::GetString(DebugShaderStringType stringType) {
 			lines = DisassembleArm2((const u8 *)jitted_, jittedSize_);
 #elif defined(MIPS)
 			// No MIPS disassembler defined
+#elif defined(__PPC__)
+			// No PPC disassembler defined
 #else
 			lines = DisassembleX86((const u8 *)jitted_, jittedSize_);
 #endif

diff --git a/GPU/GPUCommon.cpp b/GPU/GPUCommon.cpp
@@ -973,7 +973,7 @@ void GPUCommon::FastRunLoop(DisplayList &list) {
 	int dc = downcount;
 	for (; dc > 0; --dc) {
 		// We know that display list PCs have the upper nibble == 0 - no need to mask the pointer
-		const u32 op = *(const u32 *)(Memory::base + list.pc);
+		const u32 op = Memory::ReadUnchecked_U32(list.pc);
 		const u32 cmd = op >> 24;
 		const CommandInfo &info = cmdInfo[cmd];
 		const u32 diff = op ^ gstate.cmdmem[cmd];
@@ -2151,8 +2151,8 @@ void GPUCommon::FlushImm() {
 	// through vertices.
 	// Since the only known use is Thrillville and it only uses it to clear, we just use color and pos.
 	struct ImmVertex {
-		uint32_t color;
-		float xyz[3];
+		u32_le color;
+		float_le xyz[3];
 	};
 	ImmVertex temp[MAX_IMMBUFFER_SIZE];
 	for (int i = 0; i < immCount_; i++) {

diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp
@@ -1503,7 +1503,7 @@ void ClearRectangle(const VertexData &v0, const VertexData &v1)
 			DrawingCoords p = TransformUnit::ScreenToDrawing(pprime);
 
 			if ((z & 0xFF) == (z >> 8)) {
-				u16 *row = &depthbuf.as16[p.x + p.y * stride];
+				void *row = &depthbuf.as16[p.x + p.y * stride];
 				memset(row, z, w * 2);
 			} else {
 				for (int x = 0; x < w; ++x) {
@@ -1560,8 +1560,8 @@ void ClearRectangle(const VertexData &v0, const VertexData &v1)
 		if (gstate.FrameBufFormat() == GE_FORMAT_8888) {
 			for (pprime.y = minY; pprime.y < maxY; pprime.y += 16) {
 				DrawingCoords p = TransformUnit::ScreenToDrawing(pprime);
-				if ((new_color & 0xFF) == (new_color >> 8) && (new_color & 0xFFFF) == (new_color >> 16)) {
-					u32 *row = &fb.as32[p.x + p.y * stride];
+				if ((new_color & 0xFF) == (u8)(new_color >> 8) && (new_color & 0xFFFF) == (new_color >> 16)) {
+					void *row = &fb.as32[p.x + p.y * stride];
 					memset(row, new_color, w * 4);
 				} else {
 					for (int x = 0; x < w; ++x) {
@@ -1573,7 +1573,7 @@ void ClearRectangle(const VertexData &v0, const VertexData &v1)
 			for (pprime.y = minY; pprime.y < maxY; pprime.y += 16) {
 				DrawingCoords p = TransformUnit::ScreenToDrawing(pprime);
 				if ((new_color16 & 0xFF) == (new_color16 >> 8)) {
-					u16 *row = &fb.as16[p.x + p.y * stride];
+					void *row = &fb.as16[p.x + p.y * stride];
 					memset(row, new_color16, w * 2);
 				} else {
 					for (int x = 0; x < w; ++x) {

diff --git a/GPU/Software/Sampler.cpp b/GPU/Software/Sampler.cpp
@@ -277,16 +277,16 @@ static inline u32 LookupColor(unsigned int index, unsigned int level)
 
 	switch (gstate.getClutPaletteFormat()) {
 	case GE_CMODE_16BIT_BGR5650:
-		return RGB565ToRGBA8888(reinterpret_cast<u16*>(clut)[index + clutSharingOffset]);
+		return RGB565ToRGBA8888(reinterpret_cast<u16_le*>(clut)[index + clutSharingOffset]);
 
 	case GE_CMODE_16BIT_ABGR5551:
-		return RGBA5551ToRGBA8888(reinterpret_cast<u16*>(clut)[index + clutSharingOffset]);
+		return RGBA5551ToRGBA8888(reinterpret_cast<u16_le*>(clut)[index + clutSharingOffset]);
 
 	case GE_CMODE_16BIT_ABGR4444:
-		return RGBA4444ToRGBA8888(reinterpret_cast<u16*>(clut)[index + clutSharingOffset]);
+		return RGBA4444ToRGBA8888(reinterpret_cast<u16_le*>(clut)[index + clutSharingOffset]);
 
 	case GE_CMODE_32BIT_ABGR8888:
-		return clut[index + clutSharingOffset];
+		return reinterpret_cast<u32_le*>(clut)[index + clutSharingOffset];
 
 	default:
 		ERROR_LOG_REPORT(G3D, "Software: Unsupported palette format: %x", gstate.getClutPaletteFormat());
@@ -319,43 +319,43 @@ inline static Nearest4 SampleNearest(int u[N], int v[N], const u8 *srcptr, int t
 	case GE_TFMT_4444:
 		for (int i = 0; i < N; ++i) {
 			const u8 *src = srcptr + GetPixelDataOffset<16>(texbufw, u[i], v[i]);
-			res.v[i] = RGBA4444ToRGBA8888(*(const u16 *)src);
+			res.v[i] = RGBA4444ToRGBA8888(*(const u16_le *)src);
 		}
 		return res;
 
 	case GE_TFMT_5551:
 		for (int i = 0; i < N; ++i) {
 			const u8 *src = srcptr + GetPixelDataOffset<16>(texbufw, u[i], v[i]);
-			res.v[i] = RGBA5551ToRGBA8888(*(const u16 *)src);
+			res.v[i] = RGBA5551ToRGBA8888(*(const u16_le *)src);
 		}
 		return res;
 
 	case GE_TFMT_5650:
 		for (int i = 0; i < N; ++i) {
 			const u8 *src = srcptr + GetPixelDataOffset<16>(texbufw, u[i], v[i]);
-			res.v[i] = RGB565ToRGBA8888(*(const u16 *)src);
+			res.v[i] = RGB565ToRGBA8888(*(const u16_le *)src);
 		}
 		return res;
 
 	case GE_TFMT_8888:
 		for (int i = 0; i < N; ++i) {
 			const u8 *src = srcptr + GetPixelDataOffset<32>(texbufw, u[i], v[i]);
-			res.v[i] = *(const u32 *)src;
+			res.v[i] = *(const u32_le *)src;
 		}
 		return res;
 
 	case GE_TFMT_CLUT32:
 		for (int i = 0; i < N; ++i) {
 			const u8 *src = srcptr + GetPixelDataOffset<32>(texbufw, u[i], v[i]);
-			u32 val = src[0] + (src[1] << 8) + (src[2] << 16) + (src[3] << 24);
+			u32 val = *(u32_le *)src;
 			res.v[i] = LookupColor(gstate.transformClutIndex(val), 0);
 		}
 		return res;
 
 	case GE_TFMT_CLUT16:
 		for (int i = 0; i < N; ++i) {
 			const u8 *src = srcptr + GetPixelDataOffset<16>(texbufw, u[i], v[i]);
-			u16 val = src[0] + (src[1] << 8);
+			u16 val = *(u16_le *)src;
 			res.v[i] = LookupColor(gstate.transformClutIndex(val), 0);
 		}
 		return res;