From 4778bda90f966ef6f5a9ac87ea1957ac23239c5b Mon Sep 17 00:00:00 2001
From: Sauraen <sauraen@gmail.com>
Date: Fri, 30 Aug 2024 22:10:46 -0700
Subject: [PATCH] New dir light transform working

---
 docs/Documentation/Performance.md |  45 ++++++-----
 f3dex3.s                          | 123 ++++++++++++++++--------------
 2 files changed, 91 insertions(+), 77 deletions(-)

diff --git a/docs/Documentation/Performance.md b/docs/Documentation/Performance.md
index b167187..4932e87 100644
--- a/docs/Documentation/Performance.md
+++ b/docs/Documentation/Performance.md
@@ -7,24 +7,21 @@ visual effects are desired and increasing the RSP time a bit does not affect the
 overall performance. If your game is RSP bound, using the base version of F3DEX3
 will make it slower.
 
-Conversely, F3DEX3_LVP_NOC was created with the goal of matching the RSP
-performance of F3DEX2 on all critical paths in the microcode: command dispatch,
-vertex processing, and triangle processing. Then, the RDP and memory traffic
-performance improvements of F3DEX3--56 vertex buffer, auto-batched rendering,
-etc.--should improve performance from there. This means that F3DEX3_LVP_NOC can
-improve performance regardless of whether your game is RSP bound or RDP bound.
-
-Note that F3DEX3_LVP_NOC is still slightly slower than F3DEX2 for various other
-tasks--for example, the one-time setup when loading vertices, outside the loop
-over vertices, is a little slower.
+Conversely, F3DEX3_LVP_NOC matches or beats the RSP performance of F3DEX2 on all
+critical paths in the microcode, including command dispatch, vertex processing,
+and triangle processing. Then, the RDP and memory traffic performance
+improvements of F3DEX3--56-vertex buffer, auto-batched rendering, etc.--should
+further improve performance from there. This means that switching from F3DEX2 to
+F3DEX3_LVP_NOC should always improve performance regardless of whether your game
+is RSP bound or RDP bound.
 
 
 # Performance Results
 
-These are cycle counts for all the critical paths in the microcode. Lower is
+These are cycle counts for many key paths in the microcode. Lower numbers are
 better. The timings are hand-counted taking into account all pipeline stalls and
-all dual-issue conditions. Instruction alignment is sometimes taken into
-account, otherwise assumed to be optimal.
+all dual-issue conditions. Instruction alignment after branches is sometimes
+taken into account, otherwise assumed to be optimal.
 
 Vertex / lighting numbers assume no special features (texgen, packed normals,
 etc.) Tri numbers assume texture, shade, and Z, and not flushing the buffer.
@@ -47,6 +44,16 @@ measured yet".
 | Vtx pair, 7 dir lts        | 118    | 112            | 138        | 356        | 375    |
 | Vtx pair, 8 dir lts        | Can't  | 119            | 145        | 385        | 404    |
 | Vtx pair, 9 dir lts        | Can't  | 126            | 152        | 414        | 433    |
+| Light dir xfrm, 0 dir lts  | Can't  | 95             | 95         | None       | None   |
+| Light dir xfrm, 1 dir lt   | 141    | 95             | 95         | None       | None   |
+| Light dir xfrm, 2 dir lts  | 180    | 96             | 96         | None       | None   |
+| Light dir xfrm, 3 dir lts  | 219    | 121            | 121        | None       | None   |
+| Light dir xfrm, 4 dir lts  | 258    | 122            | 122        | None       | None   |
+| Light dir xfrm, 5 dir lts  | 297    | 147            | 147        | None       | None   |
+| Light dir xfrm, 6 dir lts  | 336    | 148            | 148        | None       | None   |
+| Light dir xfrm, 7 dir lts  | 375    | 173            | 173        | None       | None   |
+| Light dir xfrm, 8 dir lts  | Can't  | 174            | 174        | None       | None   |
+| Light dir xfrm, 9 dir lts  | Can't  | 199            | 199        | None       | None   |
 | Only/2nd tri to offscreen  | 27     | 26             | 26         | 26         | 26     |
 | 1st tri to offscreen       | 28     | 27             | 27         | 27         | 27     |
 | Only/2nd tri to clip       | 32     | 31             | 31         | 31         | 31     |
@@ -75,12 +82,12 @@ configuration.
 
 | Microcode      | Scene 1 | Scene 2 | Scene 3 |
 |----------------|---------|---------|---------|
-| F3DEX3         | 7.64ms  | 3.13ms  | 2.37ms  |
-| F3DEX3_NOC     | 7.07ms  | 2.89ms  | 2.14ms  |
-| F3DEX3_LVP     | 4.57ms  | 1.77ms  | 1.67ms  |
-| F3DEX3_LVP_NOC | Outdated  | | |
-| F3DEX2         | No*     | No*     | No*     |
-| Vertex count   | 3664    | 1608    | 1608    |
+| F3DEX3         | 7.41ms  | 2.99ms  | 2.22ms  |
+| F3DEX3_NOC     | 6.85ms  | 2.75ms  | 1.98ms  |
+| F3DEX3_LVP     | 4.12ms  | 1.59ms  | 1.48ms  |
+| F3DEX3_LVP_NOC | 3.34ms  | 1.27ms  | 1.16ms  |
+| F3DEX2         | Can't*  | Can't*  | Can't*  |
+| Vertex count   | 3557    | 1548    | 1548    |
 
 *F3DEX2 does not contain performance counters, so the portion of the RSP time
 taken for vertex processing cannot be measured.
diff --git a/f3dex3.s b/f3dex3.s
index 743200b..55697cd 100644
--- a/f3dex3.s
+++ b/f3dex3.s
@@ -3214,20 +3214,19 @@ lt_continue_setup:
     addi    $3, $3, altBase           // Point to ambient light; stored through vtx proc
     andi    $17, $5, G_TEXTURE_GEN >> 8 // This is clipPolyRead, but not touched in vtx_store
     and     $11, $11, $7              // Zero if either matrix or lights invalid
-    bnez    $11, lt_setup_skip_xfrm
+    bnez    $11, lt_setup_after_xfrm
      sb     $10, dirLightsXfrmValid
 xfrm_dir_lights:
     // Transform directional lights' direction by M transpose.
     // First, load M transpose. Can use any regs except $v8-$v12, $v28-$v31.
-    // This algorithm clobbers all of $v0-$v7 and $v16-$v23 with the transposes.
+    // This algorithm clobbers all of $v0-$v7 and $v16-$v23 with the transposes;
+    // it's mainly just an excuse to use the rare ltv and swv instructions.
     // The F3DEX2 implementation takes 18 instructions and 11 cycles.
-    // This implementation is 23 instructions and 17 cycles, including the scalar
-    // setup for below. But this version loads M transpose to both halves of
-    // each vector so we can process two lights at a time, which matters because
-    // there's always at least 3 lights (technically 2 for EX3)--the lookat
-    // directions. So a few extra cycles here to save at least one light loop
-    // iteration is worth it. This implementation is mainly just an excuse to
-    // use the rare ltv and swv instructions.
+    // This implementation is 23 instructions and 17 cycles, but this version
+    // loads M transpose to both halves of each vector so we can process two
+    // lights at a time, which matters because there's always at least 3 lights
+    // (technically 2 for EX3)--the lookat directions. Plus, those 17 cycles
+    // also include a few instructions starting the loop.
     // Memory at mMatrix contains, in shorts within qwords, for the elements we care about:
     // A B C - D E F - (X int, Y int)
     // G H I - - - - - (Z int, W int)
@@ -3256,68 +3255,76 @@ xfrm_dir_lights:
     vmrg    $v0, $v0, $v4[0q]                 // A D G - A D G -
     lsv     $v18[14], (mMatrix + 0x2C)($zero) // U - O R U - O(R)
     vmrg    $v2, $v2, $v6[0q]                 // I - C F I - C F
-    move    curLight, $3
+    lpv     $v3[0], (lightBufferLookat - altBase)(altBaseReg) // Lookat 0 and 1
     vmudh   $v17, vOne, $v17[1q]              // N - T - N - T -
-    // nop
+    li      curLight, altBase - 4 * lightSize // + ltBufOfs = light -4; write pointer
     vmrg    $v1, $v1, $v5                     // B E H - B E H -
     // nop
+    // Interleave the start of transforming pairs of dir lights, including lookat.
     vmrg    $v16, $v16, $v20[0q]              // M P S - M P S -
     swv     $v18[4], (tempXfrmSingle)(rdpCmdBufEndP1) // Stores O R U - O R U -
-    // vnop
+    vmudh   $v29, $v0,  $v3[0h]
     lqv     $v18,    (tempXfrmSingle)(rdpCmdBufEndP1)
     vmrg    $v17, $v17, $v21                  // N Q T - N Q T -
     swv     $v2[4],  (tempXfrmSingle)(rdpCmdBufEndP1) // Stores C F I - C F I -
-    // vnop
+    vmadh   $v29, $v1,  $v3[1h]
     lqv     $v2,     (tempXfrmSingle)(rdpCmdBufEndP1)
-    
-xfrm_light_loop:
-    beq     curLight, altBaseReg, xfrm_light_post
-     lpv    $v3,  (ltBufOfs + 8 - lightSize)(curLight) // Light or lookat 0 dir in elems 0-2
-    addi    $20, curLight, (ltBufOfs + 12 - lightSize) // Target = last word of light
-    addi    curLight, curLight, -lightSize
-    j       xfrm_single_dir
-     li     $ra, xfrm_light_loop
-    
-xfrm_light_post:
-    // Lookat 0: input already in $v3, target is xfrmLookatDirs.
-    jal     xfrm_single_dir
-     li     $20, OSTask + OSTask_ucode_data //xfrmLookatDirs
-    // Lookat 1: curLight still pointing to light 0, target is 4 bytes later.
-    lpv     $v3[4], (ltBufOfs + 0 - lightSize)(curLight) // Lookat 1 dir in elems 0-2
-    jal     xfrm_single_dir
-     li     $20, OSTask + OSTask_ucode_data_size
-lt_setup_skip_xfrm:
-    // Load first light direction to $v13, which is not used throughout vtx processing.
-    j       vtx_after_lt_setup
-     lpv    $v13[0], (ltBufOfs + 8 - lightSize)($3) // Xfrmed dir in elems 4-6
-
-xfrm_single_dir:
-    vmudn   $v29, $v16, $v3[0]
-    vmadh   $v29, $v0,  $v3[0]
-    vmadn   $v29, $v17, $v3[1]
-    vmadh   $v29, $v1,  $v3[1]
-    vmadn   $v29, $v18, $v3[2]
-    vmadh   $v4,  $v2,  $v3[2]   // $v4[0:2] = light dir in model space
+    vmadn   $v29, $v16, $v3[0h]
+    // 18 cycles (NOTE(review): the xfrm_dir_lights header comment says 17 cycles; confirm which is current)
+xfrm_light_loop_1:
+    vmadn   $v29, $v18, $v3[2h]  // Not executed when entering at _2; xfrm_light_store_lookat issues it in its j delay slot
+xfrm_light_loop_2:
+    vmadn   $v29, $v17, $v3[1h]
+    vmadh   $v4,  $v2,  $v3[2h]  // $v4[0:2] and [4:6] = two light dirs in model space
+    vrsqh   $v29[0], $v20[0]     // $v20/$v21 are written near the end of the loop body, so these
+    vrsql   $v23[0], $v21[0]     // vrsq* work on the previous pass's sums; $v23 / $v22 are read
+    vrsqh   $v22[0], $v20[4]     // further down as the frac / int scaling (elems 0 and 4)
+    addi    curLight, curLight, 2 * lightSize // Iters: -2, 0, 2, ...
+    vrsql   $v23[4], $v21[4]
+    lw      $20, (ltBufOfs + 8 + 2 * lightSize)(curLight) // First iter = light 0
+    vrsqh   $v22[4], $v31[2]     // 0
+    lw      $24, (ltBufOfs + 8 + 3 * lightSize)(curLight) // First iter = light 1
     vmudh   $v29, $v4, $v4       // Squared
+    sub     $10, curLight, altBaseReg // Is curLight (write ptr) <= 0?
     vreadacc $v7, ACC_MIDDLE     // Read not-clamped value
+    sub     $11, curLight, $3    // Is curLight (write ptr) <, =, or > ambient light?
     vreadacc $v6, ACC_UPPER
-    vmudm   $v29, vOne, $v7[2]   // Sum of squared components
-    vmadh   $v29, vOne, $v6[2]
-    vmadm   $v29, vOne, $v7[1]
-    vmadh   $v29, vOne, $v6[1]
-    vmadn   $v7,  $v7,  vOne     // elem 0; swapped so we can do vmadn and get result
-    vmadh   $v6,  $v6,  vOne
-    vrsqh   $v29[0], $v6[0]
-    vrsql   $v7[0], $v7[0]
-    vrsqh   $v6[0], $v31[2]      // 0
-    vmudm   $v29, $v4, $v7[0]    // Vec int * frac scaling
-    vmadh   $v4, $v4, $v6[0]     // Vec int * int scaling
-    spv     $v4[0], (tempXfrmSingle)(rdpCmdBufEndP1) // Store elem 0-2 as bytes to temp memory
-    lw      $11, (tempXfrmSingle)(rdpCmdBufEndP1)    // Load 3 (4) bytes to scalar unit
-    jr      $ra
-     sw     $11, (0)($20)                  // Store 3 (4) bytes to target address
-     // This clobbers the specular size
+    sw      $20,    (tempXfrmSingle)(rdpCmdBufEndP1) // Stage the first dir word loaded above (first iter: light 0)
+    vmudm   $v29, $v19, $v23[0h] // Vec int * frac scaling; $v19 = $v4 saved by the vcopy below on the prior pass
+    sw      $24,    (tempXfrmSingle + 4)(rdpCmdBufEndP1) // Stage the second dir word loaded above (first iter: light 1)
+    vmadh   $v5,  $v19, $v22[0h] // Vec int * int scaling
+    lpv     $v3[0], (tempXfrmSingle)(rdpCmdBufEndP1) // Load dirs 0-2, 4-6
+    vmudm   $v29, vOne, $v7[2h]  // Sum of squared components
+    vmadh   $v29, vOne, $v6[2h]
+    vmadm   $v29, vOne, $v7[1h]
+    vmadh   $v29, vOne, $v6[1h]
+    spv     $v5[0], (tempXfrmSingle)(rdpCmdBufEndP1) // Store elem 0-2, 4-6 as bytes to temp memory (same slot the lpv above already read)
+    vmadn   $v21, $v7,  vOne     // elem 0, 4; swapped so we can do vmadn and get result
+    lw      $20,    (tempXfrmSingle)(rdpCmdBufEndP1) // Load 3 (4) bytes to scalar unit
+    vmadh   $v20, $v6,  vOne     // $v20/$v21 feed the vrsq* sequence at the top of the next pass
+    lw      $24,    (tempXfrmSingle + 4)(rdpCmdBufEndP1) // Load 3 (4) bytes to scalar unit
+    vcopy   $v19, $v4            // Keep this pass's $v4 for the scaling multiplies on the next pass
+    blez    $10, xfrm_light_store_lookat // curLight = -2 or 0
+     vmudh  $v29, $v0,  $v3[0h]  // Delay slot: first multiply for the pair just loaded into $v3
+     // 20 cycles from xfrm_light_loop_2 not counting land
+    vmadh   $v29, $v1,  $v3[1h]
+    bgtz    $11, lt_setup_after_xfrm // curLight > ambient; only one light valid
+     sw     $20, (ltBufOfs + 0xC - 2 * lightSize)(curLight) // Write light relative -2
+    vmadn   $v29, $v16, $v3[0h]
+    bltz    $11, xfrm_light_loop_1   // curLight < ambient; more lights to compute
+     sw     $24, (ltBufOfs + 0xC - 1 * lightSize)(curLight) // Write light relative -1; $11 == 0 falls through with both stores done
+lt_setup_after_xfrm:
+    // Load first light direction to $v13, which is not used throughout vtx processing.
+    j       vtx_after_lt_setup
+     lpv    $v13[0], (ltBufOfs + 8 - lightSize)($3) // Delay slot. Xfrmed dir in elems 4-6
     
+xfrm_light_store_lookat:
+    vmadh   $v29, $v1,  $v3[1h]  // Same accumulate step the fall-through path issues right after the blez
+    spv     $v5[0], (xfrmLookatDirs)($zero) // First time is garbage; second actual
+    vmadn   $v29, $v16, $v3[0h]  // Same accumulate step the fall-through path issues before its bltz
+    j       xfrm_light_loop_2    // Enter past xfrm_light_loop_1's single instruction...
+     vmadn  $v29, $v18, $v3[2h]  // ...which is issued here in the delay slot instead
+
 
 .if CFG_NO_OCCLUSION_PLANE // New LVP_NOC
 .align 8