hrydgard · unknownbrackets · Sep 18, 2022 · Sep 18, 2022 · Sep 18, 2022 · hrydgard
diff --git a/GPU/Common/VertexDecoderArm.cpp b/GPU/Common/VertexDecoderArm.cpp
@@ -872,22 +872,14 @@ void VertexDecoderJitCache::Jit_NormalFloat() {
 	STMIA(scratchReg, false, 3, tempReg1, tempReg2, tempReg3);
 }
 
-// Through expands into floats, always. Might want to look at changing this.
 void VertexDecoderJitCache::Jit_PosS8Through() {
-	DEBUG_LOG_REPORT_ONCE(vertexS8Through, G3D, "Using S8 positions in throughmode");
 	_dbg_assert_msg_(fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScratchRegs must be in order.");
 	_dbg_assert_msg_(fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScratchRegs must be in order.");
 
-	// TODO: SIMD
-	LDRSB(tempReg1, srcReg, dec_->posoff);
-	LDRSB(tempReg2, srcReg, dec_->posoff + 1);
-	LDRB(tempReg3, srcReg, dec_->posoff + 2);
-	static const ARMReg tr[3] = { tempReg1, tempReg2, tempReg3 };
-	static const ARMReg fr[3] = { fpScratchReg, fpScratchReg2, fpScratchReg3 };
+	// 8-bit positions in throughmode always decode to 0, depth included.
+	VEOR(neonScratchReg, neonScratchReg, neonScratchReg);
+	VEOR(neonScratchReg2, neonScratchReg, neonScratchReg);
 	ADD(scratchReg, dstReg, dec_->decFmt.posoff);
-	VMOV(neonScratchReg, tempReg1, tempReg2);
-	VMOV(neonScratchReg2, tempReg3, tempReg3);
-	VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ);
 	VST1(F_32, neonScratchReg, scratchReg, 2, ALIGN_NONE);
 }
 

diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp
@@ -668,15 +668,11 @@ void VertexDecoderJitCache::Jit_PosFloat() {
 }
 
 void VertexDecoderJitCache::Jit_PosS8Through() {
-	LDRSB(INDEX_UNSIGNED, tempReg1, srcReg, dec_->posoff);
-	LDRSB(INDEX_UNSIGNED, tempReg2, srcReg, dec_->posoff + 1);
-	LDRB(INDEX_UNSIGNED, tempReg3, srcReg, dec_->posoff + 2);
-	fp.SCVTF(fpScratchReg, tempReg1);
-	fp.SCVTF(fpScratchReg2, tempReg2);
-	fp.SCVTF(fpScratchReg3, tempReg3);
+	// 8-bit positions in throughmode always decode to 0, depth included.
+	fp.EOR(fpScratchReg, fpScratchReg, fpScratchReg);
 	STR(INDEX_UNSIGNED, fpScratchReg, dstReg, dec_->decFmt.posoff);
-	STR(INDEX_UNSIGNED, fpScratchReg2, dstReg, dec_->decFmt.posoff + 4);
-	STR(INDEX_UNSIGNED, fpScratchReg3, dstReg, dec_->decFmt.posoff + 8);
+	STR(INDEX_UNSIGNED, fpScratchReg, dstReg, dec_->decFmt.posoff + 4);
+	STR(INDEX_UNSIGNED, fpScratchReg, dstReg, dec_->decFmt.posoff + 8);
 }
 
 void VertexDecoderJitCache::Jit_PosS16Through() {

diff --git a/GPU/Common/VertexDecoderCommon.cpp b/GPU/Common/VertexDecoderCommon.cpp
@@ -773,14 +773,20 @@ void VertexDecoder::Step_PosFloatSkin() const
 	Vec3ByMatrix43(pos, fn, skinMatrix);
 }
 
-void VertexDecoder::Step_PosS8Through() const
-{
+void VertexDecoder::Step_PosInvalid() const {
+	// Invalid positions are just culled. Simulate by forcing every component to infinity.
 	float *v = (float *)(decoded_ + decFmt.posoff);
-	const s8 *sv = (const s8 *)(ptr_ + posoff);
-	const u8 *uv = (const u8 *)(ptr_ + posoff);
-	v[0] = sv[0];
-	v[1] = sv[1];
-	v[2] = uv[2];
+	v[0] = std::numeric_limits<float>::infinity();
+	v[1] = std::numeric_limits<float>::infinity();
+	v[2] = std::numeric_limits<float>::infinity();
+}
+
+void VertexDecoder::Step_PosS8Through() const {
+	// 8-bit positions in throughmode always decode to 0, depth included.
+	float *v = (float *)(decoded_ + decFmt.posoff);
+	v[0] = 0;
+	v[1] = 0;
+	v[2] = 0;
 }
 
 void VertexDecoder::Step_PosS16Through() const
@@ -1023,35 +1029,35 @@ static const StepFunction nrmstep_morphskin[4] = {
 };
 
 static const StepFunction posstep[4] = {
-	&VertexDecoder::Step_PosS8,
+	&VertexDecoder::Step_PosInvalid,
 	&VertexDecoder::Step_PosS8,
 	&VertexDecoder::Step_PosS16,
 	&VertexDecoder::Step_PosFloat,
 };
 
 static const StepFunction posstep_skin[4] = {
-	&VertexDecoder::Step_PosS8Skin,
+	&VertexDecoder::Step_PosInvalid,
 	&VertexDecoder::Step_PosS8Skin,
 	&VertexDecoder::Step_PosS16Skin,
 	&VertexDecoder::Step_PosFloatSkin,
 };
 
 static const StepFunction posstep_morph[4] = {
-	&VertexDecoder::Step_PosS8Morph,
+	&VertexDecoder::Step_PosInvalid,
 	&VertexDecoder::Step_PosS8Morph,
 	&VertexDecoder::Step_PosS16Morph,
 	&VertexDecoder::Step_PosFloatMorph,
 };
 
 static const StepFunction posstep_morph_skin[4] = {
-	&VertexDecoder::Step_PosS8MorphSkin,
+	&VertexDecoder::Step_PosInvalid,
 	&VertexDecoder::Step_PosS8MorphSkin,
 	&VertexDecoder::Step_PosS16MorphSkin,
 	&VertexDecoder::Step_PosFloatMorphSkin,
 };
 
 static const StepFunction posstep_through[4] = {
-	&VertexDecoder::Step_PosS8Through,
+	&VertexDecoder::Step_PosInvalid,
 	&VertexDecoder::Step_PosS8Through,
 	&VertexDecoder::Step_PosS16Through,
 	&VertexDecoder::Step_PosFloatThrough,
@@ -1224,9 +1230,8 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options,
 	bool reportNoPos = false;
 	if (!pos) {
 		reportNoPos = true;
-		pos = 1;
 	}
-	if (pos) { // there's always a position
+	if (pos >= 0) { // there's always a position, even when the format is 0 (invalid)
 		size = align(size, posalign[pos]);
 		posoff = size;
 		size += possize[pos];

diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h
@@ -433,6 +433,7 @@ class VertexDecoder {
 	void Step_PosS16MorphSkin() const;
 	void Step_PosFloatMorphSkin() const;
 
+	void Step_PosInvalid() const;
 	void Step_PosS8Through() const;
 	void Step_PosS16Through() const;
 	void Step_PosFloatThrough() const;

diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp
@@ -1345,14 +1345,9 @@ void VertexDecoderJitCache::Jit_NormalFloatSkin() {
 
 // Through expands into floats, always. Might want to look at changing this.
 void VertexDecoderJitCache::Jit_PosS8Through() {
-	DEBUG_LOG_REPORT_ONCE(vertexS8Through, G3D, "Using S8 positions in throughmode");
 	// SIMD doesn't really matter since this isn't useful on hardware.
+	XORPS(fpScratchReg, R(fpScratchReg));
 	for (int i = 0; i < 3; i++) {
-		if (i == 2)
-			MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->posoff + i));
-		else
-			MOVSX(32, 8, tempReg1, MDisp(srcReg, dec_->posoff + i));
-		CVTSI2SS(fpScratchReg, R(tempReg1));
 		MOVSS(MDisp(dstReg, dec_->decFmt.posoff + i * 4), fpScratchReg);
 	}
 }

diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp
@@ -503,10 +503,8 @@ void TransformUnit::SubmitPrimitive(const void* vertices, const void* indices, G
 	if (gstate_c.skipDrawReason & SKIPDRAW_SKIPFRAME) {
 		return;
 	}
-	// Throughmode never draws 8-bit primitives, maybe because they can't fully specify the screen?
-	if ((vertex_type & GE_VTYPE_THROUGH_MASK) != 0 && (vertex_type & GE_VTYPE_POS_MASK) == GE_VTYPE_POS_8BIT)
-		return;
 	// Vertices without position are just entirely culled.
+	// Note: Throughmode does draw 8-bit primitives, but positions are always zero - handled in decode.
 	if ((vertex_type & GE_VTYPE_POS_MASK) == 0)
 		return;
 

diff --git a/headless/Headless.cpp b/headless/Headless.cpp
@@ -126,7 +126,7 @@ int printUsage(const char *progname, const char *reason)
 	fprintf(stderr, "  -m, --mount umd.cso   mount iso on umd1:\n");
 	fprintf(stderr, "  -r, --root some/path  mount path on host0: (elfs must be in here)\n");
 	fprintf(stderr, "  -l, --log             full log output, not just emulated printfs\n");
-	fprintf(stderr, " --debugger=PORT        enable websocket debugger and break at start\n");
+	fprintf(stderr, "  --debugger=PORT       enable websocket debugger and break at start\n");
 
 	fprintf(stderr, "  --graphics=BACKEND    use a different gpu backend\n");
 	fprintf(stderr, "                        options: gles, software, directx9, etc.\n");

diff --git a/test.py b/test.py
@@ -147,6 +147,7 @@ def target():
   "gpu/commands/blend",
   "gpu/commands/blend565",
   "gpu/commands/blocktransfer",
+  "gpu/commands/fog",
   "gpu/commands/material",
   "gpu/displaylist/alignment",
   "gpu/dither/dither",
@@ -157,7 +158,11 @@ def target():
   "gpu/ge/enqueueparam",
   "gpu/ge/queue",
   "gpu/primitives/indices",
+  "gpu/primitives/invalidprim",
+  "gpu/primitives/trianglefan",
+  "gpu/primitives/trianglestrip",
   "gpu/primitives/triangles",
+  "gpu/rendertarget/copy",
   "gpu/rendertarget/depal",
   "gpu/signals/pause",
   "gpu/signals/pause2",
@@ -269,6 +274,7 @@ def target():
   "threads/mutex/refer",
   "threads/mutex/try",
   "threads/mutex/unlock",
+  "threads/mutex/unlock2",
   "threads/semaphores/semaphores",
   "threads/semaphores/cancel",
   "threads/semaphores/create",
@@ -394,21 +400,19 @@ def target():
   "gpu/ge/get",
   "gpu/primitives/bezier",
   "gpu/primitives/continue",
-  "gpu/primitives/invalidprim",
+  "gpu/primitives/immediate",
   "gpu/primitives/lines",
   "gpu/primitives/linestrip",
   "gpu/primitives/points",
   "gpu/primitives/rectangles",
   "gpu/primitives/spline",
-  "gpu/primitives/trianglefan",
-  "gpu/primitives/trianglestrip",
   "gpu/reflection/reflection",
-  "gpu/rendertarget/copy",
   "gpu/rendertarget/rendertarget",
   "gpu/signals/continue",
   "gpu/signals/jumps",
   "gpu/signals/simple",
   "gpu/simple/simple",
+  "gpu/textures/size",
   "gpu/triangle/triangle",
   "gpu/vertices/colors",
   "gpu/vertices/texcoords",