From 52057557d14df9a9603c968934546e32aa8c44d5 Mon Sep 17 00:00:00 2001
From: Sauraen <sauraen@gmail.com>
Date: Wed, 1 Jan 2025 21:41:26 -0800
Subject: [PATCH] Working on lighting setup rewrite

---
 f3dex3.s    | 1359 ++++++++++++++-------------------------------------
 gbi.h       |   17 +-
 rsp/gbi.inc |    4 +-
 3 files changed, 389 insertions(+), 991 deletions(-)

diff --git a/f3dex3.s b/f3dex3.s
index fc4188c..b743970 100644
--- a/f3dex3.s
+++ b/f3dex3.s
@@ -258,10 +258,10 @@ texrectWord2:
 rdpHalf1Val:
     .fill 4
     
-dirLightsXfrmValid:
-    .db 0
+pointLightFlagOrDirXfrmValid:
+    .db 0   // Sign bit set if there are point lights. [1, 7F] if dir lights and xfrm valid.
 numLightsxSize:
-    .db 0   // Overwrites rdpHalf1Val when written
+    .db 0   // Overwrites second half of rdpHalf1Val when written
 
 // displaylist stack length
 displayListStackLength:
@@ -507,6 +507,14 @@ jumpTableEntry G_POPMTX_end   // G_POPMTX
 jumpTableEntry G_MTX_end      // G_MTX (multiply)
 jumpTableEntry G_MOVEMEM_end  // G_MOVEMEM, G_MTX (load)
 
+.if (. & 7) != 0
+    .error "packedConstants alignment broken"
+.endif
+packedConstants:  // See ltbasic_start_packed_ao for explanations of these values
+    .db 0xF8
+    .db 0xFC
+    .db 0x08
+
 .macro miniTableEntry, addr
     .if addr < 0x1000 || addr >= 0x1400
         .error "Handler address out of range!"
@@ -771,43 +779,42 @@ vZero equ $v0  // all elements = 0
 
 /*
 Scalar regs:
-      Tri write   Clip VW    Vtx write   Lighting  Cmd dispatch
-$zero -------------------- Hardwired zero ---------------------
-$1    v1 texptr   <---------- vtxLeft ---------->  temp, init 0
-$2    v2 shdptr   next vtx ----->                      temp
-$3    v3 shdflg   last/offscr                          temp
-$4    flat shading vtx or (perf) initial FIFO stall time ------
-$5    v1flag lmaj <--------- vGeomMid ---------->  
-$6    geom mode   clipMaskIdx -->  
-$7    v2flag tile <---------- fogFlag ---------->  cmd byte
-$8    v3flag      <---------- outVtx2 ---------->  cmdBufOver
-$9    xp texenab  clipMask -----> <----- curLight  ovlInitClock
-$10   ------------------------ temp2 --------------------------
-$11   ------------------------- temp --------------------------
-$12   --------------------- perfCounterD ----------------------
-$13   ---------------------- altBaseReg -----------------------
-$14               <----------- inVtx ----------->
-$15               <--------- outVtxBase -------->
-$16               clipFlags ----> 
-$17               clipPolyRead -> 
-$18   <---------- clipPolySelect> 
-$19      temp      onscrvtx   outVtx1 ---------->     dmaLen
-$20      temp     <---------- flagsV1 ---------->    dmemAddr
-$21   <---------- clipPolyWrite > <----- ambLight
-$22   -------------------- rdpCmdBufEndP1 ---------------------
-$23   --------------------- rdpCmdBufPtr ----------------------
-$24      temp     <---------- flagsV2 ---------->  cmd_w1_dram
-$25     cmd_w0 ----------------->                     cmd_w0
-$26   ---------------------- taskDataPtr ----------------------
-$27   -------------------- inputBufferPos ---------------------
-$28   --------------------- perfCounterA ----------------------
-$29   --------------------- perfCounterB ----------------------
-$30   --------------------- perfCounterC ----------------------
-$ra   return address, sometimes sign bit is flag --------------
+      Tri write   Clip VW       Vtx write   Lighting  Cmd dispatch
+$zero ---------------------- Hardwired zero ----------------------
+$1    v1 texptr   <------------- vtxLeft ---------->  temp, init 0
+$2    v2 shdptr   clipVNext ------->                      temp
+$3    v3 shdflg   clipVLastOfsc  vLoopRet --------->      temp
+$4    flat shading vtx or (perf) initial FIFO stall time ---------
+$5    v1flag lmaj <------------ vGeomMid ---------->  
+$6    geom mode   clipMaskIdx -----> <----- lbPacked
+$7    v2flag tile <------------- fogFlag ---------->  cmd byte
+$8    v3flag      <------------- outVtx2 ---------->  cmdBufOver
+$9    xp texenab  clipMask --------> <----- curLight  ovlInitClock
+$10   -------------------------- temp2 ---------------------------
+$11   --------------------------- temp ---------------------------
+$12   ----------------------- perfCounterD -----------------------
+$13   ------------------------ altBaseReg ------------------------
+$14               <-------------- inVtx ----------->
+$15               <------------ outVtxBase -------->
+$16               clipFlags ------->
+$17               clipPolyRead ---->
+$18   <---------- clipPolySelect --> 
+$19      temp     clipVOnsc      outVtx1 ---------->     dmaLen
+$20      temp     <------------- flagsV1 ---------->    dmemAddr
+$21   <---------- clipPolyWrite ---> <----- ambLight
+$22   ---------------------- rdpCmdBufEndP1 ----------------------
+$23   ----------------------- rdpCmdBufPtr -----------------------
+$24      temp     <------------- flagsV2 ---------->  cmd_w1_dram
+$25     cmd_w0 --------------------> <----- lbAfter      cmd_w0
+$26   ------------------------ taskDataPtr -----------------------
+$27   ---------------------- inputBufferPos ----------------------
+$28   ----------------------- perfCounterA -----------------------
+$29   ----------------------- perfCounterB -----------------------
+$30   ----------------------- perfCounterC -----------------------
+$ra   return address, sometimes sign bit is flag -----------------
 */
 
 // Global scalar regs:
-//                 $zero // Hardwired zero scalar register
 perfCounterD   equ $12   // Performance counter D (functions depend on config)
 altBaseReg     equ $13   // Alternate base address register for vector loads
 rdpCmdBufEndP1 equ $22   // Pointer to one command word past "end" (middle) of RDP command buf
@@ -817,10 +824,10 @@ inputBufferPos equ $27   // DMEM position within display list input buffer, rela
 perfCounterA   equ $28   // Performance counter A (functions depend on config)
 perfCounterB   equ $29   // Performance counter B (functions depend on config)
 perfCounterC   equ $30   // Performance counter C (functions depend on config)
-//                 $ra   // Return address
 
 // Vertex write:
 vtxLeft        equ $1    // Number of vertices left to process * 0x10
+vLoopRet       equ $3    // Return address at end of vtx loop = top of loop or misc lighting
 vGeomMid       equ $5    // Middle two bytes of geometry mode
 fogFlag        equ $7    // 8 if fog enabled, else 0
 outVtx2        equ $8    // Pointer to second or dummy (= outVtx1) transformed vert
@@ -830,11 +837,16 @@ outVtx1        equ $19   // Pointer to first transformed vert
 flagsV1        equ $20   // Clip flags for vertex 1
 flagsV2        equ $24   // Clip flags for vertex 2
 
-// Lighting
+// Lighting (lb = ltbasic)
+lbPacked       equ $6    // Nonzero if packed normals enabled
 curLight       equ $9    // Current light pointer with offset
 ambLight       equ $21   // Ambient (top) light pointer with offset
+lbAfter        equ $25   // Address to return to after main lighting loop (vertex or extras)
 
 // Clipping
+clipVNext      equ $2    // Next vertex (vertex at forward end of current edge)
+clipVLastOfsc  equ $3    // Last vertex / offscreen vertex
+clipVOnsc      equ $19   // Onscreen vertex
 clipMaskIdx    equ $6    // Clip mask index 4-0
 clipMask       equ $9    // Current clip mask (one bit)
 clipFlags      equ $16   // Current clipping flags being checked
@@ -975,109 +987,6 @@ tempPrevVtxGarbage equ 0x50 // Up to 2 * 0x26 = 0x4C used -> to 0x9C
 //////////////////////////////////// IMEM //////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////
 
-// Macros for placing code in different places based on the microcode version
-
-.macro instantiate_mtx_end_begin
-// Multiplies the temp loaded matrix into the M or VP matrix
-    lhu     $6, (movememTable + G_MV_MMTX)($1) // Output; $1 holds 0 for M or 4 for VP.
-    li      $3, tempMatrix // Input 1 = temp mem (loaded mtx)
-    jal     while_wait_dma_busy
-     move   $2, $6 // Input 0 = output
-    // Followed immediately by instantiate_mtx_multiply. These need to be broken
-    // up so we can insert the global mtx_multiply label between them.
-.endmacro
-.macro instantiate_mtx_multiply
-// $3, $2 are input matrices; $6 is output matrix; $7 is 0 for return to vtx
-    addi    $10, $3, 0x0018
-@@loop:
-    vmadn   $v7, $v31, $v31[2]  // 0
-    addi    $11, $3, 0x0008
-    vmadh   $v6, $v31, $v31[2]  // 0
-    addi    $2, $2, -0x0020
-    vmudh   $v29, $v31, $v31[2] // 0
-@@innerloop:
-    ldv     $v3[0], 0x0040($2)
-    ldv     $v3[8], 0x0040($2)
-    lqv     $v1[0], 0x0020($3) // Input 1
-    ldv     $v2[0], 0x0020($2)
-    ldv     $v2[8], 0x0020($2)
-    lqv     $v0[0], 0x0000($3) // Input 1
-    vmadl   $v29, $v3, $v1[0h]
-    addi    $3, $3, 0x0002
-    vmadm   $v29, $v2, $v1[0h]
-    addi    $2, $2, 0x0008 // Increment input 0 pointer
-    vmadn   $v5, $v3, $v0[0h]
-    bne     $3, $11, @@innerloop
-     vmadh  $v4, $v2, $v0[0h]
-    bne     $3, $10, @@loop
-     addi   $3, $3, 0x0008
-    sqv     $v7[0], (0x0020)($6)
-    sqv     $v6[0], (0x0000)($6)
-.if CFG_LEGACY_VTX_PIPE
-    beqz    $7, vtx_after_mtx_multiply
-.endif
-     sqv    $v4[0], (0x0010)($6)
-    j       run_next_DL_command
-     sqv    $v5[0], (0x0030)($6)
-.endmacro
-
-.macro instantiate_branch_wz
-    lhu     $10, (vertexTable)(cmd_w0)  // Vertex addr from byte 3
-.if CFG_G_BRANCH_W                      // G_BRANCH_W/G_BRANCH_Z difference; this defines F3DZEX vs. F3DEX2
-    lh      $10, VTX_W_INT($10)         // read the w coordinate of the vertex (f3dzex)
-.else
-    lw      $10, VTX_SCR_Z($10)         // read the screen z coordinate (int and frac) of the vertex (f3dex2)
-.endif
-    sub     $2, $10, cmd_w1_dram        // subtract the w/z value being tested
-    bgez    $2, run_next_DL_command     // if vtx.w/z >= cmd w/z, continue running this DL
-     lw     cmd_w1_dram, rdpHalf1Val    // load the RDPHALF1 value as the location to branch to
-    j       branch_dl                   // need $2 < 0 for nopush and cmd_w1_dram
-     li     cmd_w0, 0                   // No count of DL cmds to skip
-.endmacro
-
-.macro instantiate_dma_io
-    jal     segmented_to_physical // Convert the provided segmented address (in cmd_w1_dram) to a virtual one
-     lh     dmemAddr, (inputBufferEnd - 0x07)(inputBufferPos) // Get the 16 bits in the middle of the command word (since inputBufferPos was already incremented for the next command)
-    andi    dmaLen, cmd_w0, 0x0FF8 // Mask out any bits in the length to ensure 8-byte alignment
-    // At this point, dmemAddr's highest bit is the flag, it's next 13 bits are the DMEM address, and then it's last two bits are the upper 2 of size
-    // So an arithmetic shift right 2 will preserve the flag as being the sign bit and get rid of the 2 size bits, shifting the DMEM address to start at the LSbit
-    sra     dmemAddr, dmemAddr, 2
-    j       dma_read_write  // Trigger a DMA read or write, depending on the G_DMA_IO flag (which will occupy the sign bit of dmemAddr)
-     li     $ra, wait_for_dma_and_run_next_command  // Setup the return address for running the next DL command
-.endmacro
-
-.macro instantiate_memset
-    llv     $v2[0], (rdpHalf1Val)($zero) // Load the memset value
-    sll     cmd_w0, cmd_w0, 8           // Clear upper byte
-    jal     segmented_to_physical
-     srl    cmd_w0, cmd_w0, 8           // Number of bytes to memset (must be mult of 16)
-    li      $3, memsetBufferStart + 0x10 // Last qword set is memsetBufferStart
-    jal     @@clamp_to_memset_buffer
-     vmudh  $v2, vOne, $v2[1]           // Move element 1 (lower bytes) to all
-    addi    $2, $2, memsetBufferStart   // First qword set is one below end
-@@pre_loop:
-    sqv     $v2, (-0x10)($2)
-    bne     $2, $3, @@pre_loop
-     addi   $2, -0x10
-@@transaction_loop:
-    jal     @@clamp_to_memset_buffer
-     li     dmemAddr, 0x8000 | memsetBufferStart  // Always write from start of buffer
-    jal     dma_read_write
-     addi   dmaLen, $2, -1
-    sub     cmd_w0, cmd_w0, $2
-    bgtz    cmd_w0, @@transaction_loop
-     add    cmd_w1_dram, cmd_w1_dram, $2
-    j       wait_for_dma_and_run_next_command
-     // Delay slot harmless
-@@clamp_to_memset_buffer:
-    addi    $11, cmd_w0, -memsetBufferSize // $2 = min(cmd_w0, memsetBufferSize)
-    sra     $10, $11, 31
-    and     $11, $11, $10
-    jr      $ra
-     addi   $2, $11, memsetBufferSize
-.endmacro
-
-
 // RSP IMEM
 .create CODE_FILE, 0x00001080
 
@@ -1191,12 +1100,9 @@ G_MOVEMEM_end:
     j       while_wait_dma_busy                         // wait for the DMA read to finish
      li     $ra, run_next_DL_command
 
-.if !CFG_LEGACY_VTX_PIPE
 G_DMA_IO_handler:
-G_BRANCH_WZ_handler:
 G_MEMSET_handler:
-    j       ovl234_ovl4_entrypoint           // Delay slot is harmless
-.endif
+    j       ovl234_clipmisc_entrypoint       // Delay slot is harmless
 load_cmds_handler:
      lb     $3, materialCullMode
     bltz    $3, run_next_DL_command  // If cull mode is < 0, in mat second time, skip the load
@@ -1301,7 +1207,17 @@ G_DMA_IO_handler:
     instantiate_dma_io
     
 G_BRANCH_WZ_handler:
-    instantiate_branch_wz
+    lhu     $10, (vertexTable)(cmd_w0)  // Vertex addr from byte 3
+.if CFG_G_BRANCH_W                      // G_BRANCH_W/G_BRANCH_Z difference; this defines F3DZEX vs. F3DEX2
+    lh      $10, VTX_W_INT($10)         // read the w coordinate of the vertex (f3dzex)
+.else
+    lw      $10, VTX_SCR_Z($10)         // read the screen z coordinate (int and frac) of the vertex (f3dex2)
+.endif
+    sub     $2, $10, cmd_w1_dram        // subtract the w/z value being tested
+    bgez    $2, run_next_DL_command     // if vtx.w/z >= cmd w/z, continue running this DL
+     lw     cmd_w1_dram, rdpHalf1Val    // load the RDPHALF1 value as the location to branch to
+    j       branch_dl                   // need $2 < 0 for nopush and cmd_w1_dram
+     li     cmd_w0, 0                   // No count of DL cmds to skip
     
 G_MEMSET_handler:
     instantiate_memset
@@ -1419,7 +1335,7 @@ tri_noinit: // ra is next cmd, second tri in TRI2, or middle of clipping
     vmrg    $v4, tHPos, $v8   // v4 = max(vert1.y, vert2.y) > vert3.y : higher(vert1, vert2) ? vert3 (highest vertex of vert1, vert2, vert3)
     mfc2    $9, $v26[0]       // elem 0 = x = cross product => lower 16 bits, sign extended
     vmrg    tHPos, $v8, tHPos // v14 = max(vert1.y, vert2.y) > vert3.y : vert3 ? higher(vert1, vert2)
-    bnez    $10, ovl234_clipping_entrypoint // Facing info and occlusion may be garbage if need to clip
+    bnez    $10, ovl234_clipmisc_entrypoint // Facing info and occlusion may be garbage if need to clip
      // 30 cycles
      sll    $20, $6, 21       // Bit 10 in the sign bit, for facing cull
     vlt     $v29, $v6, $v2    // VCO = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y)
@@ -1811,36 +1727,59 @@ tri_fan_store:
 .if (. & 4)
     .warning "One instruction of padding before ovl234"
 .endif
-
 .align 8
+
+vtx_select_lighting:
+    bltz    $8, ovl234_ltadv_entrypoint  // Advanced lighting if have point lights
+     andi   $10, vGeomMid, (G_LIGHTING_SPECULAR | G_FRESNEL_COLOR | G_FRESNEL_ALPHA) >> 8
+    bnez    $10, ovl234_ltadv_entrypoint  // Advanced lighting if specular or Fresnel
+     lbu    ambLight, numLightsxSize
+    // Fallthrough to ltbasic on whichever overlay is loaded
+
+.if (. & 4)
+    .error "vtx_select_lighting must be an even number of instructions"
+.endif
 ovl234_start:
 
 ovl3_start:
 // Clipping overlay.
 
-// Jump here to do lighting. If overlay 3 is loaded (this code), loads overlay 2
+// Jump here for basic lighting setup. If overlay 3 is loaded (this code), loads overlay 2
 // and jumps to right here, which is now in the new code.
-ovl234_lighting_entrypoint_ovl3ver:        // same IMEM address as ovl234_lighting_entrypoint
+ovl234_ltbasic_entrypoint_ovl3ver:         // same IMEM address as ovl234_ltbasic_entrypoint
 .if CFG_PROFILING_B
     addi    perfCounterC, perfCounterC, 1  // Count lighting overlay load
 .endif
     jal     load_overlays_2_3_4            // Not a call; returns to $ra-8 = here
      li     cmd_w1_dram, orga(ovl2_start)  // set up a load for overlay 2
 
-.if !CFG_LEGACY_VTX_PIPE
-// Jump here for all overlay 4 features. If overlay 3 is loaded (this code), loads
+// Jump here for advanced lighting. If overlay 3 is loaded (this code), loads
 // overlay 4 and jumps to right here, which is now in the new code.
-ovl234_ovl4_entrypoint_ovl3ver:            // same IMEM address as ovl234_ovl4_entrypoint
+ovl234_ltadv_entrypoint_ovl3ver:           // same IMEM address as ovl234_ltadv_entrypoint
 .if CFG_PROFILING_B
     addi    perfCounterD, perfCounterD, 1  // Count overlay 4 load
 .endif
     jal     load_overlays_2_3_4            // Not a call; returns to $ra-8 = here
      li     cmd_w1_dram, orga(ovl4_start)  // set up a load for overlay 4
-.endif //!CFG_LEGACY_VTX_PIPE
 
-// Jump here to do clipping. If overlay 3 is loaded (this code), directly starts
+// Jump here for clipping and rare commands. If overlay 3 is loaded (this code), directly starts
 // the clipping code.
-ovl234_clipping_entrypoint:
+ovl234_clipmisc_entrypoint:
+    bnez    $1, clip_start                 // In clipping, $1 is vtx 1 addr, never 0. Cmd dispatch, $1 = 0.
+     li     $3, (0xFF00 | G_MEMSET)
+    beq     $3, $7, g_memset_ovl3
+     lw     cmd_w1_dram, (inputBufferEnd - 4)(inputBufferPos) // Overwritten by overlay load
+g_dma_io_ovl3:  // otherwise
+    jal     segmented_to_physical // Convert the provided segmented address (in cmd_w1_dram) to a virtual one
+     lh     dmemAddr, (inputBufferEnd - 0x07)(inputBufferPos) // Get the 16 bits in the middle of the command word (since inputBufferPos was already incremented for the next command)
+    andi    dmaLen, cmd_w0, 0x0FF8 // Mask out any bits in the length to ensure 8-byte alignment
+    // At this point, dmemAddr's highest bit is the flag, it's next 13 bits are the DMEM address, and then it's last two bits are the upper 2 of size
+    // So an arithmetic shift right 2 will preserve the flag as being the sign bit and get rid of the 2 size bits, shifting the DMEM address to start at the LSbit
+    sra     dmemAddr, dmemAddr, 2
+    j       dma_read_write  // Trigger a DMA read or write, depending on the G_DMA_IO flag (which will occupy the sign bit of dmemAddr)
+     li     $ra, wait_for_dma_and_run_next_command  // Setup the return address for running the next DL command
+
+clip_start:
     sh      $ra, tempTriRA                 // Tri return after clipping
 .if CFG_PROFILING_B
     addi    perfCounterB, perfCounterB, 1  // Increment clipped (input) tris count
@@ -1860,25 +1799,24 @@ clip_init_used_loop:
     // Write the current three verts as the initial polygon
     sh      $1, (clipPoly - 6 + 0)(clipPolySelect)
     sh      $2, (clipPoly - 6 + 2)(clipPolySelect)
-    sh      $3, (clipPoly - 6 + 4)(clipPolySelect)
-    sh      $zero, (clipPoly)(clipPolySelect)  // Zero to mark end of polygon
+    sh      $3, (clipPoly - 6 + 4)(clipPolySelect) // Initial state $3 = clipVLastOfs
+    sh      $zero, (clipPoly)(clipPolySelect)  // nullptr to mark end of polygon
     li      clipMask, CLIP_CAMPLANE            // Initial clip mask for no nearclipping
 // Available locals here: $11, $1, $7, $20, $24, $10
 clip_condlooptop: // Loop over six clipping conditions: near, far, +y, +x, -y, -x
-    lhu     clipFlags, VTX_CLIP($3)            // Load flags for V3, which will be the final vertex of the last polygon
-    and     clipFlags, clipFlags, clipMask     // Mask V3's flags to current clip condition
+    lhu     clipFlags, VTX_CLIP(clipVLastOfs)  // Load flags for final vertex of the last polygon
     addi    clipPolyRead,   clipPolySelect, -6 // Start reading at the beginning of the old polygon
-    xori    clipPolySelect, clipPolySelect, 6 ^ (clipPoly2 + 6 - clipPoly) // Swap to the other polygon memory
+    xori    clipPolySelect, clipPolySelect, 6 ^ ((clipPoly2 - clipPoly) + 6) // Swap to the other polygon memory
     addi    clipPolyWrite,  clipPolySelect, -6 // Start writing at the beginning of the new polygon
+    and     clipFlags, clipFlags, clipMask     // Mask last flags to current clip condition
 clip_edgelooptop: // Loop over edges connecting verts, possibly subdivide the edge
-    // Edge starts from V3, ends at V2
-    lhu     $2, (clipPoly)(clipPolyRead)       // Read next vertex of input polygon as V2 (end of edge)
+    lhu     clipVNext, (clipPoly)(clipPolyRead) // Read next vertex (farther end of edge)
     addi    clipPolyRead, clipPolyRead, 0x0002 // Increment read pointer
-    beqz    $2, clip_nextcond                  // If V2 is 0, done with input polygon
-     lhu    $11, VTX_CLIP($2)                  // Load flags for V2
-    and     $11, $11, clipMask                 // Mask V2's flags to current clip condition
+    beqz    clipVNext, clip_nextcond           // If next vtx is nullptr, done with input polygon
+     lhu    $11, VTX_CLIP(clipVNext)           // Load flags for next vtx
+    and     $11, $11, clipMask                 // Mask next flags to current clip condition
     beq     $11, clipFlags, clip_nextedge      // Both set or both clear = both off screen or both on screen, no subdivision
-     move   clipFlags, $11                     // clipFlags = masked V2's flags
+     move   clipFlags, $11                     // clipFlags = masked next vtx's flags
     // Going to subdivide this edge. Find available temp vertex slot.
     li      outVtxBase, clipTempVertsEnd
 clip_find_unused_loop:
@@ -1888,12 +1826,12 @@ clip_find_unused_loop:
      andi   $11, $11, CLIP_VTX_USED
     bnez    $11, clip_find_unused_loop
      addi   outVtxBase, outVtxBase, -vtxSize
-    beqz    clipFlags, clip_skipswap23         // V2 flag is clear / on screen, therefore V3 is set / off screen
-     move   $19, $2                            // 
-    move    $19, $3                            // Otherwise swap V2 and V3; note we are overwriting $3 but not $2
-    move    $3, $2                             // 
-clip_skipswap23: // After possible swap, $19 = vtx not meeting clip cond / on screen, $3 = vtx meeting clip cond / off screen
-    // Interpolate between these two vertices; create a new vertex which is on the
+    beqz    clipFlags, clip_skipswap23         // Next vtx flag is clear / on screen,
+     move   clipVOnsc, clipVNext               // therefore last vtx is set / off screen
+    move    clipVOnsc, clipVLastOfsc           // Otherwise swap; note we are overwriting
+    move    clipVLastOfsc, clipVNext           // clipVLastOfsc but not clipVNext
+clip_skipswap23:
+    // Interpolate between clipVLastOfs and clipVOns; create a new vertex which is on the
     // clipping boundary (e.g. at the screen edge)
 vClBaseF equ $v20
 vClBaseI equ $v21
@@ -1911,14 +1849,14 @@ vClFade2 equ $v2
     0 -Y :      Y1 + 2*W1         (Y1 + 2*W1) - (Y2 + 2*W2)
     */
     xori    $11, clipMaskIdx, 1              // Invert sign of condition
-    ldv     $v4[0], VTX_FRAC_VEC($19)        // Vtx on screen, frac pos
+    ldv     $v4[0], VTX_FRAC_VEC(clipVOnsc)  // Vtx on screen, frac pos
     ctc2    $11, $vcc                        // Conditions 1 (+y) or 3 (+x) -> vcc[0] = 0
-    ldv     $v5[0], VTX_INT_VEC ($19)        // Vtx on screen, int pos
+    ldv     $v5[0], VTX_INT_VEC (clipVOnsc)  // Vtx on screen, int pos
     vmrg    $v29, vOne, $v31[1]              // elem 0 is 1 if W or neg cond, -1 if pos cond
     andi    $11, clipMaskIdx, 4              // W condition and screen clipping
-    ldv     $v4[8], VTX_FRAC_VEC($3)         // Vtx off screen, frac pos
+    ldv     $v4[8], VTX_FRAC_VEC(clipVLastOfsc) // Vtx off screen, frac pos
     bnez    $11, clip_w              // If so, use 1 or -1
-     ldv    $v5[8], VTX_INT_VEC ($3)         // Vtx off screen, int pos
+     ldv    $v5[8], VTX_INT_VEC (clipVLastOfsc) // Vtx off screen, int pos
     vmudh   $v29, $v29, $v31[3]              // elem 0 is (1 or -1) * 2 (clip ratio)
     andi    $11, clipMaskIdx, 2              // Conditions 2 (-x) or 3 (+x)
     vmudm   vClBaseF, vOne, $v4[0h]          // Set accumulator (care about 3, 7) to X
@@ -1949,7 +1887,7 @@ clip_skipxy:
     vmudn   $v2, $v2, $v29[3]             // multiply reciprocal by +/- 2
     sh      outVtxBase, (clipPoly)(clipPolyWrite) // Write pointer to generated vertex to polygon
     vmadh   $v3, $v3, $v29[3]
-    lhu     $11, VTX_CLIP($3)             // Load clip flags for off screen vert
+    lhu     $11, VTX_CLIP(clipVLastOfsc)  // Load clip flags for off screen vert
     veq     $v3, $v3, $v31[2]             // 0; if reciprocal high is 0
     andi    fogFlag, vGeomMid, G_FOG >> 8 // Nonzero if fog enabled
     vmrg    $v2, $v2, $v31[1]             // keep reciprocal low, otherwise set to -1
@@ -1959,8 +1897,9 @@ clip_skipxy:
     vmadm   vClDiffI, vClDiffI, $v2[3]    // sum int * reciprocal, frac out
     li      $1, -1                        // $1 < 0 triggers last vtx loop iter
     vmadn   vClDiffF, $v31, $v31[2]       // 0; get int out
-    sh      $11, VTX_CLIP($3)             // Store modified clip flags for off screen vert
+    sh      $11, VTX_CLIP(clipVLastOfsc)  // Store modified clip flags for off screen vert
     vrcph   $v24[3], vClDiffI[3]          // reciprocal again (discard result)
+    TODO
 .if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE // New LVP_NOC
     addi    outVtxBase, outVtxBase, -vtxSize // Inc'd by 2, must point to second vtx
 .endif
@@ -1984,25 +1923,25 @@ clip_skipxy:
     vmadn   $v23, $v23, vClDiffI          // * own reciprocal again? frac out
     vmadh   $v24, $v24, vClDiffI          // * own reciprocal again? int out
     vmudl   $v29, vClBaseF, $v23
-    ldv     $v6[0], VTX_FRAC_VEC($3)      // Vtx off screen, frac pos
+    ldv     $v6[0], VTX_FRAC_VEC(clipVLastOfsc)   // Vtx off screen, frac pos
     vmadm   $v29, vClBaseI, $v23
-    ldv     $v7[0], VTX_INT_VEC ($3)      // Vtx off screen, int pos
+    ldv     $v7[0], VTX_INT_VEC (clipVLastOfsc)   // Vtx off screen, int pos
     vmadn   vClDiffF, vClBaseF, $v24
-    luv     $v23[0], VTX_COLOR_VEC($3)    // Vtx off screen, RGBA
+    luv     $v23[0], VTX_COLOR_VEC(clipVLastOfsc) // Vtx off screen, RGBA
     vmadh   vClDiffI, vClBaseI, $v24      // 11:10 = vtx on screen sum * prev calculated value
-    luv     vPairRGBA[0], VTX_COLOR_VEC($19) // Vtx on screen, RGBA
+    luv     vPairRGBA[0], VTX_COLOR_VEC(clipVOnsc) // Vtx on screen, RGBA
     vmudl   $v29, vClDiffF, $v2[3]
-    llv     $v24[0], VTX_TC_VEC   ($3)    // Vtx off screen, ST
+    llv     $v24[0], VTX_TC_VEC   (clipVLastOfsc) // Vtx off screen, ST
     vmadm   vClDiffI, vClDiffI, $v2[3]
-    llv     vPairST[0], VTX_TC_VEC($19)   // Vtx on screen, ST
+    llv     vPairST[0], VTX_TC_VEC(clipVOnsc)     // Vtx on screen, ST
     vmadn   vClDiffF, $v31, $v31[2]       // End of computing vClDiff = vClBase / vClDiff
     vlt     vClDiffI, vClDiffI, vOne[0]   // If integer part of factor less than 1,
 .if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE // New LVP_NOC
-    addi    $19, rdpCmdBufEndP1, vtxSize  // Fog writes up to one vtx behind
+    addi    outVtx1, rdpCmdBufEndP1, vtxSize  // Fog writes up to one vtx behind
 .endif
     vmrg    vClDiffF, vClDiffF, $v31[1]   // keep frac part of factor, else set to 0xFFFF (max val)
 .if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE // New LVP_NOC
-    move    outVtx2, $19                  // Old last vtx regs = temp mem
+    move    outVtx2, outVtx1              // Old last vtx regs = temp mem
 .endif
     vsubc   $v29, vClDiffF, vOne[0]       // frac part - 1 for carry
     vge     vClDiffI, vClDiffI, $v31[2]   // 0; If integer part of factor >= 0 (after carry, so overall value >= 0x0000.0001),
@@ -2031,17 +1970,17 @@ clip_skipxy:
 .if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE // New LVP_NOC
 clip_after_vtx_store:
     ori     $10, $10, CLIP_VTX_USED       // Mark generated vtx as used
-    slv     sSTS[0], (VTX_TC_VEC   )($19) // Store not-twice-scaled ST
-    sh      $10,     (VTX_CLIP     )($19) // Store generated vertex flags
+    slv     sSTS[0], (VTX_TC_VEC   )(outVtx1) // Store not-twice-scaled ST
+    sh      $10,     (VTX_CLIP     )(outVtx1) // Store generated vertex flags
 .endif
 clip_nextedge:
     bnez    clipFlags, clip_edgelooptop   // Discard V2 if it was off screen (whether inserted vtx or not)
-     move   $3, $2                        // Move what was the end of the edge to be the new start of the edge
+     move   clipVLastOfsc, clipVNext      // Move what was the end of the edge to be the new start of the edge
     sub     $11, clipPolyWrite, clipPolySelect // Make sure we are not overflowing
     addi    $11, $11, 6 - ((MAX_CLIP_POLY_VERTS) * 2) // Write ptr to last zero slot
     bgez    $11, clip_done                // If so, give up
-     sh     $3, (clipPoly)(clipPolyWrite) // Former V2 was on screen, so add it to the output polygon
-    j       clip_edgelooptop
+     sh     clipVLastOfsc, (clipPoly)(clipPolyWrite) // Former V2 was on screen,
+    j       clip_edgelooptop              // so add it to the output polygon
      addi   clipPolyWrite, clipPolyWrite, 2
 
 clip_w:
@@ -2053,7 +1992,7 @@ clip_nextcond:
     sub     $11, clipPolyWrite, clipPolySelect // Are there less than 3 verts in the output polygon?
     bltz    $11, clip_done                    // If so, degenerate result, quit
      sh     $zero, (clipPoly)(clipPolyWrite)  // Terminate the output polygon with a 0
-    lhu     $3, (clipPoly - 2)(clipPolyWrite) // Initialize the edge start (V3) to the last vert
+    lhu     clipVLastOfsc, (clipPoly - 2)(clipPolyWrite) // Initialize edge start to the last vert
     beqz    clipMaskIdx, clip_draw_tris
      lbu    $11, (clipCondShifts - 1)(clipMaskIdx) // Load next clip condition shift amount
     li      clipMask, 1
@@ -2082,7 +2021,6 @@ clip_done:
     sh      $11, activeClipPlanes
     lqv     $v30, (v30Value)($zero) // Need this repeated here in case we exited early
     lh      $ra, tempTriRA
-
 fill_vertex_table:
     // Create bytes 00-07
     li      $1, 7
@@ -2104,7 +2042,38 @@ fill_vertex_table:
     bne     $2, $3, @@loop2
      sqv    $v4[0], (-0x10)($2)
     jr      $ra
-     nop
+     // Delay slot harmless
+
+g_memset_ovl3:
+    llv     $v2[0], (rdpHalf1Val)($zero) // Load the memset value
+    sll     cmd_w0, cmd_w0, 8           // Clear upper byte
+    jal     segmented_to_physical
+     srl    cmd_w0, cmd_w0, 8           // Number of bytes to memset (must be mult of 16)
+    li      $3, memsetBufferStart + 0x10 // Last qword set is memsetBufferStart
+    jal     @@clamp_to_memset_buffer
+     vmudh  $v2, vOne, $v2[1]           // Move element 1 (lower bytes) to all
+    addi    $2, $2, memsetBufferStart   // First qword set is one below end
+@@pre_loop:
+    sqv     $v2, (-0x10)($2)
+    bne     $2, $3, @@pre_loop
+     addi   $2, -0x10
+@@transaction_loop:
+    jal     @@clamp_to_memset_buffer
+     li     dmemAddr, 0x8000 | memsetBufferStart  // Always write from start of buffer
+    jal     dma_read_write
+     addi   dmaLen, $2, -1
+    sub     cmd_w0, cmd_w0, $2
+    bgtz    cmd_w0, @@transaction_loop
+     add    cmd_w1_dram, cmd_w1_dram, $2
+    j       wait_for_dma_and_run_next_command
+     // Delay slot harmless
+@@clamp_to_memset_buffer:
+    addi    $11, cmd_w0, -memsetBufferSize // $2 = min(cmd_w0, memsetBufferSize)
+    sra     $10, $11, 31
+    and     $11, $11, $10
+    jr      $ra
+     addi   $2, $11, memsetBufferSize
+
 
 ovl3_end:
 .align 8
@@ -2171,8 +2140,6 @@ vtx_after_dma:
     add     perfCounterA, perfCounterA, $11    // Add to vertex count
 .endif
 vtx_setup_constants:
-
-.if CFG_LEGACY_VTX_PIPE
     TODO check kept register assignments against xfrm_dir_lights
 
     // Computes modified viewport scale and offset including fog info, and stores
@@ -2199,16 +2166,16 @@ vtx_setup_constants:
 .if !CFG_NO_OCCLUSION_PLANE
     vmudh   sOPMs, sOPMs, $v31[5] // sOPMs is 0xC000, 0xC000, 0x4000, 0x4000, repeat
 .endif
-    llv     $v30[0], (attrOffsetST - altBase)(altBaseReg)  // Texture ST offset in 0, 1
+    lb      $11, geometryModeLabel + 3            // G_ATTROFFSET_ST_ENABLE in sign bit
     vmrg    sVPO, sVPO, $v23[1]                   // Put fog offset in elements 3,7 of vtrans
-    llv     $v30[8], (attrOffsetST - altBase)(altBaseReg)  // Texture ST offset in 4, 5
+    llv     $v30[0], (attrOffsetST - altBase)(altBaseReg)  // Texture ST offset in 0, 1
     vmov    sSTS[4], sSTS[0]
-    andi    $11, vGeomMid, G_ATTROFFSET_ST_ENABLE >> 8
+    llv     $v30[8], (attrOffsetST - altBase)(altBaseReg)  // Texture ST offset in 4, 5
     vmrg    sVPS, sVPS, $v23[0]                   // Put fog multiplier in elements 3,7 of vscale
-    bnez    $11, @@skipoffset
+    bltz    $11, @@keepoffset
      lbu    $7, mITValid
     vclr    $v30
-@@skipoffset:
+@@keepoffset:
 .if !CFG_NO_OCCLUSION_PLANE
     sqv     sOPMs, (tempOccPlusMinus)(rdpCmdBufEndP1) // Store occlusion plane -/+4000 constants
     sqv     sVPO, (tempViewportOffset)(rdpCmdBufEndP1) // Store viewport offset
@@ -2216,56 +2183,9 @@ vtx_setup_constants:
 .endif
     vmov    sSTS[5], sSTS[1]
     bgtz    $ra, clip_after_constants             // Return to clipping if from there
-     lsv    $v30[6], (perspNorm - altBase)(altBaseReg) // Perspective norm
-
-.else
-
-    // Computes modified viewport scale and offset including fog info, and stores
-    // these to temp memory in the RDP buffer. This is only used during vertex write
-    // and the first half of clipping, so that memory is not used then.
-    llv     $v23[0], (fogFactor)($zero)           // Load fog multiplier 0 and offset 1
-.if !CFG_NO_OCCLUSION_PLANE
-    vge     $v29, $v31, $v31[2h] // VCC = 00110011
-.endif
-    ldv     sVPO[0], (viewport + 8)($zero)        // Load vtrans duplicated in 0-3 and 4-7
-.if !CFG_NO_OCCLUSION_PLANE
-    vmrg    sOPMs, vOne, $v31[1] // Signs of sOPMs are --++--++
-.endif
-    ldv     sVPO[8], (viewport + 8)($zero)
-    vne     $v29, $v31, $v31[3h]                  // VCC = 11101110
-    ldv     sVPS[0], (viewport)($zero)            // Load vscale duplicated in 0-3 and 4-7
-    ldv     sVPS[8], (viewport)($zero)
-    lqv     $v30, (fxParams - altBase)(altBaseReg) // Parameters for vtx and lighting
-.if !CFG_NO_OCCLUSION_PLANE
-    vmudh   sOPMs, sOPMs, $v31[5] // sOPMs is 0xC000, 0xC000, 0x4000, 0x4000, repeat
-.endif
-    lw      $10, (geometryModeLabel)($zero)
-    vmrg    sVPO, sVPO, $v23[1]                   // Put fog offset in elements 3,7 of vtrans
-.if !CFG_NO_OCCLUSION_PLANE
-    sqv     sOPMs, (tempOccPlusMinus)(rdpCmdBufEndP1) // Store occlusion plane -/+4000 constants
-.endif
-    andi    $11, $10, G_AMBOCCLUSION
-    vmrg    sVPS, sVPS, $v23[0]                   // Put fog multiplier in elements 3,7 of vscale
-    bnez    $11, @@skipzeroao                     // Continue if AO disabled
-     sqv    sVPO, (tempViewportOffset)(rdpCmdBufEndP1) // Store viewport offset
-    vge     $v29, $v31, $v31[3]                   // VCC = 00011111
-    vmrg    $v30, $v30, $v31[2]                   // 0; zero AO values
-@@skipzeroao:
-    bgtz    $ra, clip_after_constants             // Return to clipping if from there
-     sqv    sVPS, (tempViewportScale)(rdpCmdBufEndP1) // Store viewport scale
-
-.endif
-
+     lsv    $v30[6], (perspNorm - altBase)(altBaseReg) // Perspective norm elem 3
 vtx_after_setup_constants:
-    andi    $8, vGeomMid, G_LIGHTING >> 8        // Temp to be reused below, is outVtx2
-    beqz    $8, @@skip_lighting
-     li     $16, vtx_loop_no_lighting      // This is clipFlags, but not modified
-    li      $16, lt_vtx_pair               // during vtx_store
-@@skip_lighting:
-
-.if CFG_LEGACY_VTX_PIPE
-
-    bnez    $7, skip_vtx_mvp
+    bnez    $7, vtx_skip_multiply
      li     $2, vpMatrix
     li      $3, mMatrix
     j       mtx_multiply
@@ -2273,20 +2193,21 @@ vtx_after_setup_constants:
 vtx_after_mtx_multiply:
     sqv     $v5[0], (fourthQWMVP +    0)($zero)
     sb      $10, mITValid  // $10 is nonzero from mtx_multiply, in fact 0x18
-skip_vtx_mvp:
-    bnez    $8, ovl234_lighting_entrypoint      // Lighting setup, incl. transform
+vtx_skip_multiply:
+    lb      $8, pointLightFlagOrDirXfrmValid    // $8 = outVtx2
+    andi    $11, vGeomMid, G_LIGHTING >> 8
+    bnez    $11, vtx_select_lighting
      sb     $zero, materialCullMode             // Vtx ends material
+vtx_setup_no_lighting:
+    li      vLoopRet, vtx_loop_no_lighting
 vtx_after_lt_setup:
     lqv     vM0I,     (mITMatrix + 0x00)($zero)  // Load MVP matrix
     lqv     vM2I,     (mITMatrix + 0x10)($zero)
     lqv     vM0F,     (mITMatrix + 0x20)($zero)
     lqv     vM2F,     (fourthQWMVP +  0)($zero)
-.if CFG_NO_OCCLUSION_PLANE // New LVP_NOC
-    addi    outVtxBase, outVtxBase, -vtxSize // Will inc by 2, but need point to 2nd
-.else
-    addi    outVtxBase, outVtxBase, -2*vtxSize // Going to increment this by 2 verts in loop
-.endif
+    andi    fogFlag, vGeomMid, G_FOG >> 8    // Nonzero if fog enabled
     vcopy   vM1I,  vM0I
+    srl     fogFlag, fogFlag, 5  // 8 if G_FOG is set, 0 otherwise
     vcopy   vM3I,  vM2I
     ldv     vM1I[0],  (mITMatrix + 0x08)($zero)
     vcopy   vM1F,  vM0F
@@ -2298,55 +2219,13 @@ vtx_after_lt_setup:
     ldv     vM2I[8],  (mITMatrix + 0x10)($zero)
     ldv     vM0F[8],  (mITMatrix + 0x20)($zero)
     ldv     vM2F[8],  (fourthQWMVP +  0)($zero)
+.if CFG_NO_OCCLUSION_PLANE // New LVP_NOC
+    addi    outVtxBase, outVtxBase, -vtxSize // Will inc by 2, but need point to 2nd
 .else
-    sb      $zero, materialCullMode            // Vtx ends material
-    lqv     vM0I,     (mMatrix + 0x00)($zero)  // Load M matrix
-    lqv     vM2I,     (mMatrix + 0x10)($zero)
-    lqv     vM0F,     (mMatrix + 0x20)($zero)
-    lqv     vM2F,     (mMatrix + 0x30)($zero)
-    lbu     $11, mITValid                      // 0 if matrix invalid, 1 if valid
-    vcopy   vM1I,  vM0I
-    lbu     $10, normalsMode                   // bit 0 clear if don't compute mIT, set if do
-    vcopy   vM3I,  vM2I
-    ldv     vM1I[0],  (mMatrix + 0x08)($zero)
-    vcopy   vM1F,  vM0F
-    ldv     vM3I[0],  (mMatrix + 0x18)($zero)
-    vcopy   vM3F,  vM2F
-    ldv     vM1F[0],  (mMatrix + 0x28)($zero)
-    sltiu   $11, $11, 1                        // 0 if matrix valid, 1 if invalid
-    srl     $7, vGeomMid, 9                     // G_LIGHTING in bit 1
-    and     $7, $7, $11                        // If lighting enabled and need to update matrix,
-    and     $7, $7, $10                        // and computing mIT,
-    ldv     vM3F[0],  (mMatrix + 0x38)($zero)
-    ldv     vM0I[8],  (mMatrix + 0x00)($zero)
-    ldv     vM2I[8],  (mMatrix + 0x10)($zero)
-    ldv     vM0F[8],  (mMatrix + 0x20)($zero)
-    bnez    $7, ovl234_ovl4_entrypoint         // run overlay 4 to compute M inverse transpose
-     ldv    vM2F[8],  (mMatrix + 0x30)($zero)
-vtx_after_calc_mit:
-    lqv     vVP0I,    (vpMatrix  + 0x00)($zero)
-    lqv     vVP2I,    (vpMatrix  + 0x10)($zero)
-    lqv     vVP0F,    (vpMatrix  + 0x20)($zero)
-    lqv     vVP2F,    (vpMatrix  + 0x30)($zero)
     addi    outVtxBase, outVtxBase, -2*vtxSize // Going to increment this by 2 verts in loop
-    vcopy   vVP1I, vVP0I
-    vcopy   vVP3I, vVP2I
-    ldv     vVP1I[0], (vpMatrix  + 0x08)($zero)
-    vcopy   vVP1F, vVP0F
-    ldv     vVP3I[0], (vpMatrix  + 0x18)($zero)
-    vcopy   vVP3F, vVP2F
-    ldv     vVP1F[0], (vpMatrix  + 0x28)($zero)
-    ldv     vVP3F[0], (vpMatrix  + 0x38)($zero)
-    ldv     vVP0I[8], (vpMatrix  + 0x00)($zero)
-    ldv     vVP2I[8], (vpMatrix  + 0x10)($zero)
-    ldv     vVP0F[8], (vpMatrix  + 0x20)($zero)
-    ldv     vVP2F[8], (vpMatrix  + 0x30)($zero)
 .endif
-    andi    fogFlag, vGeomMid, G_FOG >> 8    // Nonzero if fog enabled
-.if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE // New LVP_NOC
-    srl     fogFlag, fogFlag, 5  // 8 if G_FOG is set, 0 otherwise
     addi    outVtx1, rdpCmdBufEndP1, vtxSize  // Temp mem; fog writes up to vtxSize before
-    jal     while_wait_dma_busy   // Wait for vertex load to finish
+    bgezal  vLoopRet, while_wait_dma_busy  // Wait for vertex load to finish; vLoopRet < 0 if already did
      move   outVtx2, outVtx1     // for first pre-loop, same for outVtx2
     ldv     vPairPosI[0], (VTX_IN_OB + 0 * inputVtxSize)(inVtx) // 1st vec pos
     ldv     vPairPosI[8], (VTX_IN_OB + 1 * inputVtxSize)(inVtx) // 2nd vec pos
@@ -2355,15 +2234,8 @@ vtx_after_calc_mit:
     llv     vPairST[0],   (VTX_IN_TC + 0 * inputVtxSize)(inVtx) // ST in 0:1
     j       vtx_store_loop_entry
      llv    vPairST[8],   (VTX_IN_TC + 1 * inputVtxSize)(inVtx) // ST in 4:5
-.else
-    jal     while_wait_dma_busy   // Wait for vertex load to finish
-     addi   outVtx1, rdpCmdBufEndP1, tempPrevVtxGarbage  // Temp mem we can freely overwrite replaces outVtxBase
-    j       vtx_store_loop_entry
-     move   outVtx2, outVtx1     // for first pre-loop, same for outVtx2
-.endif
-
-.if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE // New LVP_NOC
 
+// TODO
 // $v0:$v7 = MVP, $v8:$v10 = sVPS/sVPO/sSTS, $v11 = available, $v12 = sFGM,
 // $v13 = first light dir, $v14:$v16 = Y/Z/vPairNrml/temp, $v17 = vPairLt/temp,
 // $v18:$v19 = available, $v20:$v21 = vPairPosI/F/temp,
@@ -2374,9 +2246,6 @@ vtx_after_calc_mit:
 
 .if CFG_NO_OCCLUSION_PLANE
 
-
-
-
 vtx_loop_no_lighting:
     vmadh   $v29, vM1I, vPairPosI[1h]
     andi    $10, $10, CLIP_SCAL_NPXY // Mask to only bits we care about
@@ -2736,366 +2605,11 @@ vertex_end:
 
 .endif
 
-.else // end of new LVP_NOC
 
-.if CFG_LEGACY_VTX_PIPE
-vtx_early_return_from_lighting:
-    vmrg    vPairRGBA, vPairLt, vPairRGBA  // RGB = light, A = vtx alpha
-.endif
-vtx_loop_no_lighting:
-vtx_return_from_lighting:
-    li      $ra, vertex_end
-.if CFG_LEGACY_VTX_PIPE
-    vmudm   vPairST, vPairST, sSTS      // Scale ST; must be after texgen
-@@skipsecond:
-.else
-    vclr    sSTO
-    andi    $11, vGeomMid, G_ATTROFFSET_ST_ENABLE >> 8
-    vmudn   $v29, vVP3F, vOne
-    beqz    $11, @@skipoffset
-     vmadh  $v29, vVP3I, vOne
-    llv     sSTO[0], (attrOffsetST - altBase)(altBaseReg) // elems 0, 1 = S, T offset
-    llv     sSTO[8], (attrOffsetST - altBase)(altBaseReg) // elems 4, 5 = S, T offset
-@@skipoffset:
-    vmadl   $v29, vVP0F, vPairPosF[0h]
-    llv     sSTS[0], (textureSettings2)($zero)  // Texture ST scale in 0, 1
-    vmadm   $v29, vVP0I, vPairPosF[0h]
-    llv     sSTS[8], (textureSettings2)($zero)  // Texture ST scale in 4, 5
-    vmadn   $v29, vVP0F, vPairPosI[0h]
-    vmadh   $v29, vVP0I, vPairPosI[0h]
-    vmadl   $v29, vVP1F, vPairPosF[1h]
-    vmadm   $v29, vVP1I, vPairPosF[1h]
-    vmadn   $v29, vVP1F, vPairPosI[1h]
-    vmadh   $v29, vVP1I, vPairPosI[1h]
-    vmadl   $v29, vVP2F, vPairPosF[2h]
-    vmadm   $v29, vVP2I, vPairPosF[2h]
-    vmadn   vPairTPosF, vVP2F, vPairPosI[2h]
-    vmadh   vPairTPosI, vVP2I, vPairPosI[2h]
-    vmudm   $v29, vPairST, sSTS         // Scale ST; must be after texgen
-    vmadh   vPairST, sSTO, vOne         // + 1 * (ST offset or zero)
-.endif
-    addi    outVtxBase, outVtxBase, 2*vtxSize
-vtx_store_for_clip:
-    // Inputs: vPairTPosI, vPairTPosF, vPairST, vPairRGBA
-    // Locals: $v20, $v21, $v25, $v26, $v16, $v17 ($v29 is temp). Also vPairST and
-    // vPairRGBA can be used as temps once stored ($v22, $v27).
-    // Scalar regs: outVtx2, outVtxBase; set to the same thing if only write 1 vtx
-    // temps $10, $11, $20, $24
-    vmudl   $v29, vPairTPosF, $v30[3] // Persp norm
-    move    outVtx2, outVtxBase          // Second and output vertices write to same mem...
-    vmadm   s1WI, vPairTPosI, $v30[3] // Persp norm
-    bltz    vtxLeft, @@skipsecond        // ...if < 0 verts remain, ...
-     vmadn  s1WF, $v31, $v31[2] // 0
-    addi    outVtx2, outVtxBase, vtxSize // ...otherwise, second vtx is next vtx
-@@skipsecond:
-    vch     $v29, vPairTPosI, vPairTPosI[3h] // Clip screen high
-    suv     vPairRGBA[4],     (VTX_COLOR_VEC )(outVtx2)
-    vcl     $v29, vPairTPosF, vPairTPosF[3h] // Clip screen low
-    suv     vPairRGBA[0],     (VTX_COLOR_VEC )(outVtxBase)
-    vrcph   $v29[0], s1WI[3]
-    cfc2    $10, $vcc // Load screen clipping results
-    vrcpl   sRTF[2], s1WF[3]
-    sdv     vPairTPosF[8],    (VTX_FRAC_VEC  )(outVtx2)
-    vrcph   sRTI[3], s1WI[7]
-    move    outVtx1, outVtxBase  // Else outVtx1 is initialized to temp memory on first pre-loop
-    vrcpl   sRTF[6], s1WF[7]
-    sdv     vPairTPosF[0],    (VTX_FRAC_VEC  )(outVtxBase)
-    vrcph   sRTI[7], $v31[2] // 0
-    sdv     vPairTPosI[8],    (VTX_INT_VEC   )(outVtx2)
-    vmudn   sSCF, vPairTPosF, $v31[3] // W * clip ratio for scaled clipping
-    sdv     vPairTPosI[0],    (VTX_INT_VEC   )(outVtxBase)
-    vmadh   sSCI, vPairTPosI, $v31[3] // W * clip ratio for scaled clipping
-    slv     vPairST[8],       (VTX_TC_VEC    )(outVtx2)
-    vmudl   $v29, s1WF, sRTF[2h]
-    slv     vPairST[0],       (VTX_TC_VEC    )(outVtxBase)
-    vmadm   $v29, s1WI, sRTF[2h]
-
-.if CFG_NO_OCCLUSION_PLANE
-    vmadn   s1WF, s1WF, sRTI[3h]
-    addi    inVtx, inVtx, 2*inputVtxSize
-    vmadh   s1WI, s1WI, sRTI[3h]
-vtx_store_loop_entry:
-// vPairST is $v22
-    ldv     vPairST[0],   (VTX_IN_TC + inputVtxSize * 0)(inVtx) // ST in 0:1, RGBA in 2:3
-    vch     $v29, vPairTPosI, sSCI[3h] // Clip scaled high
-    ldv     vPairST[8],   (VTX_IN_TC + inputVtxSize * 1)(inVtx) // ST in 4:5, RGBA in 6:7
-    vmudh   $v29, vOne, $v31[4] // 4 * 1 in elems 3, 7
-    lsv     vPairTPosI[14], (VTX_Z_INT     )(outVtx2) // load Z into W slot, will be for fog below
-    vmadn   s1WF, s1WF, $v31[0] // -4
-    lsv     vPairTPosI[6],  (VTX_Z_INT     )(outVtx1) // load Z into W slot, will be for fog below
-    vmadh   s1WI, s1WI, $v31[0] // -4
-    srl     $24, $10, 4            // Shift second vertex screen clipping to first slots
-    vcl     $v29, vPairTPosF, sSCF[3h] // Clip scaled low
-    andi    $24, $24, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about
-// sTCL is $v21
-    vcopy   sTCL, vPairST
-    cfc2    $20, $vcc // Load scaled clipping results
-    vmudl   $v29, s1WF, sRTF[2h]
-    lsv     vPairTPosF[14], (VTX_Z_FRAC    )(outVtx2) // load Z into W slot, will be for fog below
-    vmadm   $v29, s1WI, sRTF[2h]
-    lsv     vPairTPosF[6],  (VTX_Z_FRAC    )(outVtx1) // load Z into W slot, will be for fog below
-    vmadn   s1WF, s1WF, sRTI[3h]
-// vPairPosI is $v20
-    ldv     vPairPosI[0], (VTX_IN_OB + inputVtxSize * 0)(inVtx)
-    vmadh   s1WI, s1WI, sRTI[3h] // s1WI:s1WF is 1/W
-    ldv     vPairPosI[8], (VTX_IN_OB + inputVtxSize * 1)(inVtx)
-    vmov    sTCL[4], vPairST[2]
-    andi    $10, $10, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about
-    vmov    sTCL[5], vPairST[3]
-    ori     $10, $10, CLIP_VTX_USED // Write for all first verts, only matters for generated verts
-    vmudl   $v29, vPairTPosF, s1WF[3h]
-    ssv     s1WF[14],         (VTX_INV_W_FRAC)(outVtx2)
-    vmadm   $v29, vPairTPosI, s1WF[3h]
-    ssv     s1WF[6],          (VTX_INV_W_FRAC)(outVtx1)
-    vmadn   vPairTPosF, vPairTPosF, s1WI[3h]
-    ssv     s1WI[14],         (VTX_INV_W_INT )(outVtx2)
-    vmadh   vPairTPosI, vPairTPosI, s1WI[3h] // pos * 1/W
-    ssv     s1WI[6],          (VTX_INV_W_INT )(outVtx1)
-    // vnop
-    sdv     sTCL[8],      (tempVpRGBA)(rdpCmdBufEndP1) // Vtx 0 and 1 RGBA
-    // vnop
-.if CFG_LEGACY_VTX_PIPE
-    lpv     $v14[7],      (tempVpRGBA - 8)(rdpCmdBufEndP1) // Y to elem 0, 4
-.else
-// sVPO is $v17 // vtx_store ViewPort Offset
-    lqv     sVPO, (tempViewportOffset)(rdpCmdBufEndP1) // Load viewport offset
-.endif
-    vmudl   $v29, vPairTPosF, $v30[3] // Persp norm
-.if CFG_LEGACY_VTX_PIPE
-    lpv     $v15[6],      (tempVpRGBA - 8)(rdpCmdBufEndP1) // Z to elem 0, 4
-.else
-// sVPS is $v26 // vtx_store ViewPort Scale
-    lqv     sVPS, (tempViewportScale)(rdpCmdBufEndP1) // Load viewport scale
-.endif
-    vmadm   vPairTPosI, vPairTPosI, $v30[3] // Persp norm
-// vPairRGBA is $v27
-    luv     vPairRGBA[0], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx pair RGBA
-    vmadn   vPairTPosF, $v31, $v31[2] // 0
-    sll     $11, $20, 4            // Shift first vertex scaled clipping to second slots
-.if !CFG_LEGACY_VTX_PIPE
-// sTPN is $v16
-    vmov    sTPN[2], vPairPosI[7]  // Move vtx 1 packed normals to elem 2
-.endif
-    andi    $20, $20, CLIP_SCAL_NPXY // Mask to only bits we care about
-.if !CFG_LEGACY_VTX_PIPE
-    vmov    sTPN[0], vPairPosI[3]  // Move vtx 0 packed normals to elem 0
-.endif
-    andi    $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about
-    vmudh   $v29, sVPO, vOne // offset * 1
-    or      $24, $24, $20          // Combine results for second vertex
-    vmadn   vPairTPosF, vPairTPosF, sVPS // + XYZ * scale
-    or      $10, $10, $11          // Combine results for first vertex
-    vmadh   vPairTPosI, vPairTPosI, sVPS
-    sh      $24,              (VTX_CLIP      )(outVtx2) // Store second vertex clip flags
-// sFOG is $v25
-    vmadh   sFOG, vOne, $v31[6] // + 0x7F00 in all elements, clamp to 0x7FFF for fog
-.if !CFG_LEGACY_VTX_PIPE
-    sdv     sTPN[0],          (tempVpPkNorm)(rdpCmdBufEndP1) // Vtx 0 and 1 packed normals
-.endif
-    // vnop
-    sh      $10,              (VTX_CLIP      )(outVtx1)          // Store first vertex results
-// vPairNrml is $v16
-    vmudn   vPairNrml, vPairRGBA, $v31[3] // 2; left shift RGBA without clamp; vtx pair normals
-    ssv     vPairTPosF[12],   (VTX_SCR_Z_FRAC)(outVtx2)
-// sCLZ is $v21 // vtx_store CLamped Z
-    vge     sCLZ, vPairTPosI, $v31[2] // 0; clamp Z to >= 0
-    ssv     vPairTPosF[4],    (VTX_SCR_Z_FRAC)(outVtx1)
-    vge     sFOG, sFOG, $v31[6] // 0x7F00; clamp fog to >= 0 (want low byte only)
-    slv     vPairTPosI[8],    (VTX_SCR_VEC   )(outVtx2)
-    vmudn   $v29, vM3F, vOne
-    slv     vPairTPosI[0],    (VTX_SCR_VEC   )(outVtx1)
-    vmadh   $v29, vM3I, vOne
-    blez    vtxLeft, skip_return_to_lt_or_loop  // $ra left as vertex_end or clipping
-     vmadn  $v29, vM0F, vPairPosI[0h]
-    move    $ra, $16                    // Normally $ra = loop or lighting
-skip_return_to_lt_or_loop:
-    vmadh   $v29, vM0I, vPairPosI[0h]
-    addi    vtxLeft, vtxLeft, -2*inputVtxSize // Counter of remaining verts * inputVtxSize
-    vmadn   $v29, vM1F, vPairPosI[1h]
-    ssv     sCLZ[12],         (VTX_SCR_Z     )(outVtx2)
-    vmadh   $v29, vM1I, vPairPosI[1h]
-    ssv     sCLZ[4],          (VTX_SCR_Z     )(outVtx1)
-// sOUTF = vPairPosF is $v21, or vPairTPosF is $v23
-    vmadn   sOUTF, vM2F, vPairPosI[2h] // vPairPosI/F = vertices world coords
-    beqz    fogFlag, return_and_end_mat // fog disabled
-// sOUTI = vPairPosI is $v20, or vPairTPosI is $v24
-     vmadh  sOUTI, vM2I, vPairPosI[2h] // or vPairTPosI/F = vertices clip coords
-    sbv     sFOG[15],         (VTX_COLOR_A   )(outVtx2)
-    jr      $ra
-     sbv    sFOG[7],          (VTX_COLOR_A   )(outVtx1)
-    
-.else // CFG_NO_OCCLUSION_PLANE
-    
-// sOCM is $v22 // vtx_store OCclusion Mid, $v22 = vPairST
-    ldv     sOCM[0], (occlusionPlaneMidCoeffs - altBase)(altBaseReg)
-    vmadn   s1WF, s1WF, sRTI[3h]
-    ldv     sOCM[8], (occlusionPlaneMidCoeffs - altBase)(altBaseReg)
-    vmadh   s1WI, s1WI, sRTI[3h]
-    srl     $24, $10, 4            // Shift second vertex screen clipping to first slots
-    vch     $v29, vPairTPosI, sSCI[3h] // Clip scaled high
-    andi    $10, $10, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about
-    vcl     $v29, vPairTPosF, sSCF[3h] // Clip scaled low
-    andi    $24, $24, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about
-    vmudh   $v29, vOne, $v31[4] // 4 * 1 in elems 3, 7
-    cfc2    $20, $vcc // Load scaled clipping results
-    vmadn   s1WF, s1WF, $v31[0] // -4
-    ori     $10, $10, CLIP_VTX_USED // Write for all first verts, only matters for generated verts
-    vmadh   s1WI, s1WI, $v31[0] // -4
-    addi    inVtx, inVtx, 2*inputVtxSize
-    vmudn   $v29, vPairTPosF, sOCM // X * kx, Y * ky, Z * kz
-    vmadh   $v29, vPairTPosI, sOCM // Int * int
-    lsv     vPairTPosF[14], (VTX_Z_FRAC    )(outVtx2) // load Z into W slot, will be for fog below
-// sOC1 is $v21 // vtx_store OCclusion temp 1
-    vreadacc sOC1, ACC_UPPER // Load int * int portion
-    lsv     vPairTPosF[6],  (VTX_Z_FRAC    )(outVtxBase) // load Z into W slot, will be for fog below
-    vmudl   $v29, s1WF, sRTF[2h]
-    lsv     vPairTPosI[14], (VTX_Z_INT     )(outVtx2) // load Z into W slot, will be for fog below
-    vmadm   $v29, s1WI, sRTF[2h]
-    lsv     vPairTPosI[6],  (VTX_Z_INT     )(outVtxBase) // load Z into W slot, will be for fog below
-    vmadn   s1WF, s1WF, sRTI[3h]
-    sll     $11, $20, 4            // Shift first vertex scaled clipping to second slots
-    vmadh   s1WI, s1WI, sRTI[3h] // s1WI:s1WF is 1/W
-    andi    $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about
-    veq     $v29, $v31, $v31[3h] // Set VCC to 00010001
-    blez    vtxLeft, skip_return_to_lt_or_loop  // $ra left as vertex_end or clipping
-     vmrg   sOC1, sOCM, sOC1  // Put constant factor in elems 3, 7
-vtx_store_loop_entry:
-    move    $ra, $16                    // Normally $ra = loop or lighting
-skip_return_to_lt_or_loop:
-    vmudl   $v29, vPairTPosF, s1WF[3h]  // W must be overwritten with Z before here
-    ssv     s1WF[14],         (VTX_INV_W_FRAC)(outVtx2)
-    vmadm   $v29, vPairTPosI, s1WF[3h]
-    ssv     s1WF[6],          (VTX_INV_W_FRAC)(outVtx1)
-    vmadn   vPairTPosF, vPairTPosF, s1WI[3h]
-    ssv     s1WI[14],         (VTX_INV_W_INT )(outVtx2)
-    vmadh   vPairTPosI, vPairTPosI, s1WI[3h] // pos * 1/W
-    ssv     s1WI[6],          (VTX_INV_W_INT )(outVtx1)
-    vadd    sOC1, sOC1, sOC1[0q] // Add pairs upwards
-.if !CFG_LEGACY_VTX_PIPE
-// sVPO is $v17 // vtx_store ViewPort Offset
-    lqv     sVPO, (tempViewportOffset)(rdpCmdBufEndP1) // Load viewport offset
-.endif
-    // vnop
-.if CFG_LEGACY_VTX_PIPE
-    addi    vtxLeft, vtxLeft, -2*inputVtxSize // Counter of remaining verts * inputVtxSize
-.else
-// sVPS is $v16 // vtx_store ViewPort Scale
-    lqv     sVPS, (tempViewportScale)(rdpCmdBufEndP1) // Load viewport scale
-.endif
-    vmudl   $v29, vPairTPosF, $v30[3] // Persp norm
-// vPairST is $v22
-    ldv     vPairST[0],   (VTX_IN_TC + inputVtxSize * 0)(inVtx) // ST in 0:1, RGBA in 2:3
-    vmadm   vPairTPosI, vPairTPosI, $v30[3] // Persp norm
-    ldv     vPairST[8],   (VTX_IN_TC + inputVtxSize * 1)(inVtx) // ST in 4:5, RGBA in 6:7
-    vmadn   vPairTPosF, $v31, $v31[2] // 0
-// vPairPosI is $v20
-    ldv     vPairPosI[0],      (VTX_IN_OB + inputVtxSize * 0)(inVtx)
-    vadd    sOC1, sOC1, sOC1[1h] // Add elems 1, 5 to 3, 7
-    ldv     vPairPosI[8],      (VTX_IN_OB + inputVtxSize * 1)(inVtx)
-    // vnop
-// sO03 is $v26 // vtx_store Occlusion coeffs 0-3
-    ldv     sO03[0], (occlusionPlaneEdgeCoeffs - altBase)(altBaseReg) // Load coeffs 0-3
-    vmudh   $v29, sVPO, vOne // offset * 1
-    ldv     sO03[8], (occlusionPlaneEdgeCoeffs - altBase)(altBaseReg) // and for vtx 2
-    vmadn   vPairTPosF, vPairTPosF, sVPS // + XYZ * scale
-.if !CFG_LEGACY_VTX_PIPE
-// sOPM is $v17 // vtx_store Occlusion Plus Minus constants
-    lqv     sOPM, (tempOccPlusMinus)(rdpCmdBufEndP1) // Load occlusion plane -/+4000 constants
-.endif
-    vmadh   vPairTPosI, vPairTPosI, sVPS
-    andi    $20, $20, CLIP_SCAL_NPXY // Mask to only bits we care about
-// sFOG is $v16
-    vmadh   sFOG, vOne, $v31[6] // + 0x7F00 in all elements, clamp to 0x7FFF for fog
-    or      $10, $10, $11          // Combine results for first vertex
-    vlt     $v29, sOC1, $v31[2] // Occlusion plane equation < 0 in elems 3, 7
-    slv     vPairST[4],   (tempVpRGBA + 0)(rdpCmdBufEndP1) // Store vtx 0 RGBA to temp mem
-.if !CFG_LEGACY_VTX_PIPE
-// sTPN is $v18
-    vmov    sTPN[2], vPairPosI[7]  // Move vtx 1 packed normals to elem 2
-.endif
-    slv     vPairST[12],  (tempVpRGBA + 4)(rdpCmdBufEndP1) // Store vtx 1 RGBA to temp mem
-.if !CFG_LEGACY_VTX_PIPE
-    vmov    sTPN[0], vPairPosI[3]  // Move vtx 0 packed normals to elem 0
-.endif
-    cfc2    $11, $vcc // Load occlusion plane mid results to bits 3 and 7
-// sOSC is $v21 // vtx_store Occlusion SCaled up
-    vmudh   sOSC, vPairTPosI, $v31[4] // 4; scale up x and y
-    ssv     vPairTPosF[12],   (VTX_SCR_Z_FRAC)(outVtx2)
-    vge     sFOG, sFOG, $v31[6] // 0x7F00; clamp fog to >= 0 (want low byte only)
-    or      $24, $24, $20          // Combine results for second vertex
-// sCLZ is $v25 // vtx_store CLamped Z
-    vge     sCLZ, vPairTPosI, $v31[2] // 0; clamp Z to >= 0
-    ssv     vPairTPosF[4],    (VTX_SCR_Z_FRAC)(outVtx1)
-    vmulf   $v29, sOPM, vPairTPosI[1h] // -0x4000*Y1, --, +0x4000*Y1, --, repeat vtx 2
-// sO47 is $v23 // vtx_store Occlusion coeffs 0-3; $v23 = vPairTPosF
-    ldv     sO47[0], (occlusionPlaneEdgeCoeffs + 8 - altBase)(altBaseReg) // Load coeffs 4-7
-// sOC2 is $v27 // vtx_store OCclusion temp 2; $v27 = vPairRGBA
-    vmacf   sOC2, sO03, sOSC[0h]       //    4*X1*c0, --,    4*X1*c2, --, repeat vtx 2
-    ldv     sO47[8], (occlusionPlaneEdgeCoeffs + 8 - altBase)(altBaseReg) // and for vtx 2
-    vmulf   $v29, sOPM, vPairTPosI[0h] // --, -0x4000*X1, --, +0x4000*X1, repeat vtx 2
-    beqz    fogFlag, @@skipfog // fog disabled
-// sOC3 is $v21 // vtx_store OCclusion temp 3
-     vmacf  sOC3, sO03, sOSC[1h]       // --,    4*Y1*c1, --,    4*Y1*c3, repeat vtx 2
-    sbv     sFOG[15],         (VTX_COLOR_A   )(outVtx2)
-    sbv     sFOG[7],          (VTX_COLOR_A   )(outVtx1)
-@@skipfog:
-    slv     vPairTPosI[8],    (VTX_SCR_VEC   )(outVtx2)
-    veq     $v29, $v31, $v31[0q]       // Set VCC to 10101010
-    slv     vPairTPosI[0],    (VTX_SCR_VEC   )(outVtx1)
-    vmrg    sOC2, sOC2, sOC3           // Elems 0-3 are results for vtx 0, 4-7 for vtx 1
-.if CFG_LEGACY_VTX_PIPE
-    lpv     $v14[7],          (tempVpRGBA - 8)(rdpCmdBufEndP1) // Y to elem 0, 4
-.else
-    sdv     sTPN[0],          (tempVpPkNorm)(rdpCmdBufEndP1) // Vtx 0 and 1 packed normals
-.endif
-    // vnop
-    ssv     sCLZ[12],         (VTX_SCR_Z     )(outVtx2)
-    // vnop
-.if CFG_LEGACY_VTX_PIPE
-    lpv     $v15[6],          (tempVpRGBA - 8)(rdpCmdBufEndP1) // Z to elem 0, 4
-.else
-    addi    vtxLeft, vtxLeft, -2*inputVtxSize // Counter of remaining verts * inputVtxSize
-.endif
-    // vnop
-    ssv     sCLZ[4],          (VTX_SCR_Z     )(outVtx1)
-    vge     $v29, sOC2, sO47           // Each compare to coeffs 4-7
-// vPairNrml is $v16
-    lpv     vPairNrml[0],     (tempVpRGBA)(rdpCmdBufEndP1) // Vtx pair normals
-    vmudn   $v29, vM3F, vOne
-    cfc2    $20, $vcc
-    vmadh   $v29, vM3I, vOne
-// vPairRGBA is $v27
-    luv     vPairRGBA[0],     (tempVpRGBA)(rdpCmdBufEndP1) // Vtx pair colors
-    vmadn   $v29, vM0F, vPairPosI[0h]
-    andi    $11, $11, (1 << 7) | (1 << 3) // Only bits 3, 7 from occlusion
-    vmadh   $v29, vM0I, vPairPosI[0h]
-    or      $20, $20, $11    // Combine occlusion results. Any set in 0-3, 4-7 = not occluded
-    vmadn   $v29, vM1F, vPairPosI[1h]
-    andi    $11, $20, 0x00F0 // Bits 4-7 for vtx 2
-    vmadh   $v29, vM1I, vPairPosI[1h]
-    bnez    $11, @@skipv2    // If nonzero, at least one equation false, don't set occluded flag
-     andi   $20, $20, 0x000F // Bits 0-3 for vtx 1
-    ori     $24, $24, CLIP_OCCLUDED // All equations true, set vtx 2 occluded flag
-@@skipv2:
-// sOUTF = vPairPosF is $v21, or vPairTPosF is $v23
-    vmadn   sOUTF, vM2F, vPairPosI[2h] // vPairPosI/F = vertices world coords
-    bnez    $20, @@skipv1    // If nonzero, at least one equation false, don't set occluded flag
-     sh     $24,              (VTX_CLIP      )(outVtx2) // Store second vertex clip flags
-    ori     $10, $10, CLIP_OCCLUDED // All equations true, set vtx 1 occluded flag
-@@skipv1:    
-// sOUTI = vPairPosI is $v20, or vPairTPosI is $v24
-    vmadh   sOUTI, vM2I, vPairPosI[2h] // or vPairTPosI/F = vertices clip coords
-    jr      $ra
-     sh     $10,              (VTX_CLIP      )(outVtx1)          // Store first vertex results
-
-.endif // CFG_NO_OCCLUSION_PLANE
-
-.endif // New LVP_NOC
-
-.if !CFG_PROFILING_A && (!CFG_NO_OCCLUSION_PLANE || !CFG_LEGACY_VTX_PIPE)
-vertex_end:
-    j      run_next_DL_command
-     lqv   $v30, (v30Value)($zero)           // Restore value overwritten in vtx_store
+.if !CFG_PROFILING_A && (!CFG_NO_OCCLUSION_PLANE || !CFG_LEGACY_VTX_PIPE)
+vertex_end:
+    j      run_next_DL_command
+     lqv   $v30, (v30Value)($zero)           // Restore value overwritten in vtx_store
 .endif
 
 .if CFG_PROFILING_A
@@ -3117,11 +2631,45 @@ tris_end:
      add    perfCounterD, perfCounterD, $11  // Add to tri cycles perf counter
 .endif
 
-.if CFG_LEGACY_VTX_PIPE || CFG_NO_OCCLUSION_PLANE
 G_MTX_end:
-    instantiate_mtx_end_begin
+// Multiplies the temp loaded matrix into the M or VP matrix
+    lhu     $6, (movememTable + G_MV_MMTX)($1) // Output; $1 holds 0 for M or 4 for VP.
+    li      $3, tempMatrix // Input 1 = temp mem (loaded mtx)
+    jal     while_wait_dma_busy
+     move   $2, $6 // Input 0 = output
 mtx_multiply:
-    instantiate_mtx_multiply
+    // $3, $2 are input matrices; $6 is output matrix; $7 is 0 for return to vtx
+    addi    $10, $3, 0x0018
+@@loop:
+    vmadn   $v7, $v31, $v31[2]  // 0
+    addi    $11, $3, 0x0008
+    vmadh   $v6, $v31, $v31[2]  // 0
+    addi    $2, $2, -0x0020
+    vmudh   $v29, $v31, $v31[2] // 0
+@@innerloop:
+    ldv     $v3[0], 0x0040($2)
+    ldv     $v3[8], 0x0040($2)
+    lqv     $v1[0], 0x0020($3) // Input 1
+    ldv     $v2[0], 0x0020($2)
+    ldv     $v2[8], 0x0020($2)
+    lqv     $v0[0], 0x0000($3) // Input 1
+    vmadl   $v29, $v3, $v1[0h]
+    addi    $3, $3, 0x0002
+    vmadm   $v29, $v2, $v1[0h]
+    addi    $2, $2, 0x0008 // Increment input 0 pointer
+    vmadn   $v5, $v3, $v0[0h]
+    bne     $3, $11, @@innerloop
+     vmadh  $v4, $v2, $v0[0h]
+    bne     $3, $10, @@loop
+     addi   $3, $3, 0x0008
+    sqv     $v7[0], (0x0020)($6)
+    sqv     $v6[0], (0x0000)($6)
+.if CFG_LEGACY_VTX_PIPE
+    beqz    $7, vtx_after_mtx_multiply
+.endif
+     sqv    $v4[0], (0x0010)($6)
+    j       run_next_DL_command
+     sqv    $v5[0], (0x0030)($6)
 .endif
 
 
@@ -3486,42 +3034,29 @@ ovl1_padded_end:
 .headersize ovl234_start - orga()
 
 ovl2_start:
-// Lighting overlay.
+// Basic lighting overlay.
 
-// Jump here to do lighting. If overlay 2 is loaded (this code), jumps into the
+// Jump here for basic lighting setup. If overlay 2 is loaded (this code), jumps into the
 // rest of the lighting code below.
-ovl234_lighting_entrypoint:
-.if !CFG_LEGACY_VTX_PIPE
-lt_vtx_pair:
-.endif
+ovl234_ltbasic_entrypoint:
 .if CFG_PROFILING_B
-.if CFG_LEGACY_VTX_PIPE
-    nop
-.else
-    addi    perfCounterA, perfCounterA, 2    // Increment lit vertex count by 2
-.endif
-.endif
-    j       lt_continue_setup
-.if CFG_LEGACY_VTX_PIPE
-     lbu    ambLight, numLightsxSize
-.else
-     andi   $11, vGeomMid, G_PACKED_NORMALS >> 8
+    nop                                    // Needs to take up the space for the other perf counter
 .endif
+    j       ltbasic_continue_setup
+     and    $11, $8, $7                    // $8=7F lts valid, $7=18 mtx valid
 
-.if !CFG_LEGACY_VTX_PIPE
-// Jump here for all overlay 4 features. If overlay 2 is loaded (this code), loads
+// Jump here for advanced lighting. If overlay 2 is loaded (this code), loads
 // overlay 4 and jumps to right here, which is now in the new code.
-ovl234_ovl4_entrypoint_ovl2ver:            // same IMEM address as ovl234_ovl4_entrypoint
+ovl234_ltadv_entrypoint_ovl2ver:           // same IMEM address as ovl234_ltadv_entrypoint
 .if CFG_PROFILING_B
     addi    perfCounterD, perfCounterD, 1  // Count overlay 4 load
 .endif
     jal     load_overlays_2_3_4            // Not a call; returns to $ra-8 = here
      li     cmd_w1_dram, orga(ovl4_start)  // set up a load for overlay 4
-.endif //!CFG_LEGACY_VTX_PIPE
 
-// Jump here to do clipping. If overlay 2 is loaded (this code), loads overlay 3
+// Jump here for clipping and rare commands. If overlay 2 is loaded (this code), loads overlay 3
 // and jumps to right here, which is now in the new code.
-ovl234_clipping_entrypoint_ovl2ver:        // same IMEM address as ovl234_clipping_entrypoint
+ovl234_clipmisc_entrypoint_ovl2ver:        // same IMEM address as ovl234_clipmisc_entrypoint
     sh      $ra, tempTriRA                 // Tri return after clipping
 .if CFG_PROFILING_B
     addi    perfCounterD, perfCounterD, 0x4000  // Count clipping overlay load
@@ -3529,20 +3064,9 @@ ovl234_clipping_entrypoint_ovl2ver:        // same IMEM address as ovl234_clippi
     jal     load_overlays_2_3_4            // Not a call; returns to $ra-8 = here
      li     cmd_w1_dram, orga(ovl3_start)  // set up a load for overlay 3
 
-lt_continue_setup:
-.if CFG_LEGACY_VTX_PIPE
-//
-// LVP lighting setup
-//
-    llv     $v30[12], (aoAmbientFactor - altBase)(altBaseReg) // Ambient and dir to elems 6, 7
-    TODO
-    lb      $11, dirLightsXfrmValid
-    li      $10, -1                   // To mark lights valid
-    addi    ambLight, ambLight, altBase         // Point to ambient light; stored through vtx proc
-    andi    $17, vGeomMid, G_TEXTURE_GEN >> 8 // This is clipPolyRead, but not touched in vtx_store
-    and     $11, $11, $7              // Zero if either matrix or lights invalid
-    bnez    $11, lt_setup_after_xfrm
-     sb     $10, dirLightsXfrmValid
+ltbasic_continue_setup:
+    bnez    $11, ltbasic_setup_after_xfrm  // Skip if lights and matrix were valid
+     addi   ambLight, ambLight, altBase    // Point to ambient light; stored through vtx proc
 xfrm_dir_lights:
     // Transform directional lights' direction by M transpose.
     // First, load M transpose. Can use any regs except $v8-$v12, $v28-$v31.
@@ -3586,7 +3110,7 @@ xfrm_dir_lights:
     vmudh   $v17, vOne, $v17[1q]              // N - T - N - T -
     li      curLight, altBase - 4 * lightSize // + ltBufOfs = light -4; write pointer
     vmrg    $v1, $v1, $v5                     // B E H - B E H -
-    // nop
+    li      $11, 0x7F                         // Mark lights valid
     // Interleave the start of transforming pairs of dir lights, including lookat.
     vmrg    $v16, $v16, $v20[0q]              // M P S - M P S -
     swv     $v18[4], (tempXfrmSingle)(rdpCmdBufEndP1) // Stores O R U - O R U -
@@ -3599,6 +3123,7 @@ xfrm_dir_lights:
     vmadh   $v29, $v1,  $v3[1h]
     lqv     $v2,     (tempXfrmSingle)(rdpCmdBufEndP1)
     vmadn   $v29, $v16, $v3[0h]
+    sb      $11, pointLightFlagOrDirXfrmValid
     // 18 cycles
 xfrm_light_loop_1:
     vmadn   $v29, $v18, $v3[2h]
@@ -3637,15 +3162,44 @@ xfrm_light_loop_2:
      vmudh  $v29, $v0,  $v3[0h]
      // 20 cycles from xfrm_light_loop_2 not counting land
     vmadh   $v29, $v1,  $v3[1h]
-    bgtz    $11, lt_setup_after_xfrm // curLight > ambient; only one light valid
+    bgtz    $11, ltbasic_setup_after_xfrm // curLight > ambient; only one light valid
      sw     $20, (ltBufOfs + 0xC - 2 * lightSize)(curLight) // Write light relative -2
     vmadn   $v29, $v16, $v3[0h]
     bltz    $11, xfrm_light_loop_1   // curLight < ambient; more lights to compute
      sw     $24, (ltBufOfs + 0xC - 1 * lightSize)(curLight) // Write light relative -1
-lt_setup_after_xfrm:
-    // Load first light direction to $v13, which is not used throughout vtx processing.
+ltbasic_setup_after_xfrm:
+    lpv     $v13[0], (ltBufOfs + 8 - lightSize)(ambLight) // First lt xfrmed dir in elems 4-6
+    li      vLoopRet, ltbasic_start_standard
+    andi    $11, vGeomMid, (G_AMBOCCLUSION | G_PACKED_NORMALS | G_LIGHTTOALPHA | G_TEXTURE_GEN) >> 8
+    beqz    $11, vtx_after_lt_setup  // None of the above features enabled
+     li     lbAfter, vtx_return_from_lighting
+    andi    $11, $11, ~(G_TEXTURE_GEN >> 8)
+    beqz    $11, vtx_after_lt_setup  // Zero = only texgen enabled
+     li     lbAfter, ltbasic_texgen
+    andi    $11, $11, ~(G_LIGHTTOALPHA >> 8)
+    beqz    $11, vtx_after_lt_setup  // Zero = L2A (and maybe texgen) enabled, but AO and PN not
+     li     lbAfter, ltbasic_l2a
+    // AO and/or packed are enabled
+    andi    lbPacked, vGeomMid, G_PACKED_NORMALS >> 8
+    beqz    lbPacked, @@skip_packed
+     andi   lbL2A, vGeomMid, G_LIGHTTOALPHA >> 8
+    // $v13[0:3] = [0xF800, 0xFC00, (1 << 11) = 0x0800, (1 << 5) = 0x0020]
+    lpv     $v20[0], (packedConstants - altBase)(altBaseReg) // Elems 0-2 above
+    lqv     $v21, (v30Value)($zero)  // Sadly 0x0020 was in element 4 of $v30, already overwritten
+    vlt     $v29, $v31, $v31[3] // Set VCC to 11100000
+    li      lbAfter, ltbasic_no_l2a
+    vmrg    $v13, $v20, $v13 // Consts in elems 0-2, first lt dir in elems 4-6
+    vmov    $v13[3], $v21[4] // 0x0020 constant to element 3
+@@skip_packed:
+    beqz    lbL2A, @@skip_l2a
+     andi   lbAO, vGeomMid, G_AMBOCCLUSION >> 8
+    li      lbAfter, ltbasic_l2a
+@@skip_l2a:
+    beqz    lbAO, vtx_after_lt_setup
+     li     vLoopRet, ltbasic_start_packed_ao
+    llv     $v30[12], (aoAmbientFactor - altBase)(altBaseReg) // Ambient and dir to elems 6, 7
     j       vtx_after_lt_setup
-     lpv    $v13[0], (ltBufOfs + 8 - lightSize)(ambLight) // Xfrmed dir in elems 4-6
+     li     lbAfter, ltbasic_ao
     
 xfrm_light_store_lookat:
     vmadh   $v29, $v1,  $v3[1h]
@@ -3654,7 +3208,7 @@ xfrm_light_store_lookat:
     j       xfrm_light_loop_2
      vmadn  $v29, $v18, $v3[2h]
 
-// Light loop contents
+// Lighting within vertex loop
 
 .if CFG_NO_OCCLUSION_PLANE
 
@@ -3706,38 +3260,36 @@ xfrm_light_store_lookat:
 
 .endif
 
-// $11 can be used as a temporary, except between instan_lt_scl_1 and instan_lt_scl_2
+.align 8
 
-TODO $2 sign bit set if AO enabled
-TODO LTTEMP is some reg maybe $11
+// If lighting, vLoopRet = ltbasic_start_packed_ao if packed or AO, else ltbasic_start_standard
 
-.align 8
-lt_start_packed_ao:
+ltbasic_start_packed_ao:
     instan_lt_vec_1
-    beqz    TODO_PACKED_NORMALS, lt_start_ao
+    beqz    lbPacked, ltbasic_start_ao    // Go to AO only if packed normals disabled
      instan_lt_vec_2
     instan_lt_vec_3
     luv     vPairLt,     (ltBufOfs + 0)(ambLight)  // Total light level, init to ambient
-    vmudn   $v14, vPairPosI, $v13[1]      // (1 << 5); left shift normals Y
-    sra     LTTEMP, $2, 31                // All 1s if AO enabled
-    vand    vPairNrml, vPairPosI, $v13[2] // 0xF800; mask X to only top 5 bits
-    nor     LTTEMP, LTTEMP, $zero         // All 1s if AO disabled
-    vmudn   $v15, vPairPosI, $v13[0]      // (1 << 11); left shift normals Z
-    ctc2    LTTEMP, $vcc                  // Set VCC to all 1s if AO disabled, else all 0s
+    vmudn   $v14, vPairPosI, $v13[3]      // (1 << 5) = 0x20; left shift normals Y
+    sra     $11, TODO, 31             // All 1s if AO enabled
+    vand    vPairNrml, vPairPosI, $v13[0] // 0xF800; mask X to only top 5 bits
+    nor     $11, $11, $zero               // All 1s if AO disabled
+    vmudn   $v15, vPairPosI, $v13[2]      // (1 << 11) = 0x0800; left shift normals Z
+    ctc2    $11, $vcc                     // Set VCC to all 1s if AO disabled, else all 0s
     vmrg    vPairLt, vPairLt, $v31[2]     // 0; clear vPairLt if AO enabled
-    j       lt_after_start
-     vand   $v14, $v14, $v13[3]           // 0xFC00; mask Y to only top 6 bits
+    j       ltbasic_after_start
+     vand   $v14, $v14, $v13[1]           // 0xFC00; mask Y to only top 6 bits
 
-lt_start_ao:
+ltbasic_start_ao:
     lpv     vPairNrml[3],     (tempVpRGBA)(rdpCmdBufEndP1) // X to elem 3, 7
     instan_lt_vec_3
     lpv     $v15[1],      (tempVpRGBA - 8)(rdpCmdBufEndP1) // Z to elem 3, 7
     vclr    vPairLt
-    j       lt_after_start
+    j       ltbasic_after_start
      lpv    $v14[2],      (tempVpRGBA - 8)(rdpCmdBufEndP1) // Y to elem 3, 7
 
 .align 8
-lt_start_standard:
+ltbasic_start_standard:
     instan_lt_vec_1
     lpv     vPairNrml[3],     (tempVpRGBA)(rdpCmdBufEndP1) // X to elem 3, 7
     instan_lt_vec_2
@@ -3746,7 +3298,7 @@ lt_start_standard:
     lpv     $v14[2],      (tempVpRGBA - 8)(rdpCmdBufEndP1) // Y to elem 3, 7
     // vnop
     luv     vPairLt,     (ltBufOfs + 0)(ambLight)  // Total light level, init to ambient
-lt_after_start:
+ltbasic_after_start:
 .if CFG_PROFILING_B
     addi    perfCounterA, perfCounterA, 2    // Increment lit vertex count by 2
 .endif
@@ -3755,13 +3307,13 @@ lt_after_start:
     vmacf   $v29, $v15, $v13[6] // Normals Z elems 3, 7 * first light dir
     luv     vDDD[0],          (tempVpRGBA)(rdpCmdBufEndP1) // Load RGBA
     vmacf   vAAA, $v14, $v13[5] // Normals Y elems 3, 7 * first light dir
-    instan_lt_scl_1
+    instan_lt_scl_1  // $11 can be used as a temporary, except b/w instan_lt_scl_1...
     // vnop
-    beq     ambLight, altBaseReg, lt_post
-     instan_lt_scl_2
+    beq     ambLight, altBaseReg, ltbasic_post
+     instan_lt_scl_2 // ...and instan_lt_scl_2
     // vnop
     move    curLight, ambLight                   // Point to ambient light
-lt_loop:
+ltbasic_loop:
     vge     vCCC, vAAA, $v31[2] // 0; clamp dot product to >= 0
     vmulf   $v29, vPairNrml, vPairRGBA[4] // Normals X elems 3, 7 * next light dir
     luv     vBBB,        (ltBufOfs + 0 - 1*lightSize)(curLight) // Light color
@@ -3770,47 +3322,54 @@ lt_loop:
     vmacf   vAAA, $v15, vPairRGBA[6] // Normals Z elems 3, 7 * next light dir
     lpv     vPairRGBA[0], (ltBufOfs + 8 - 2*lightSize)(curLight) // Xfrmed dir in elems 4-6; DOES dual-issue
     vmudh   $v29, vOne, vPairLt // Load accum mid with current light level
-    bne     curLight, altBaseReg, lt_loop
+    bne     curLight, altBaseReg, ltbasic_loop
      vmacf  vPairLt, vBBB, vCCC[3h] // + light color * dot product
-lt_post:
+ltbasic_post:
     instan_lt_vs_45  // Starts using vBBB
     vne     $v29, $v31, $v31[3h]           // Set VCC to 11101110
-    beqz    TODO_NO_EXTRAS, vtx_return_from_lighting
+    jr      lbAfter
      vmrg   vPairRGBA, vPairLt, vDDD  // RGB = light, A = vtx alpha
-    // AO
+     
+// lbAfter = ltbasic_ao if AO else
+//           ltbasic_l2a if L2A else
+//           ltbasic_no_l2a if packed else
+//           ltbasic_texgen if texgen else
+//           vtx_return_from_lighting
+ vLtRGBOut equ vPairRGBA
+ vLtAOut   equ vAAA
+     
+ltbasic_ao:
     vsub    vPairRGBA, vDDD, $v31[7] // 0x7FFF; offset alpha
-    bgez    $2, lt_skip_ao
-     vmudh  $v29, vOne, $v31[7] // Load accum mid with 0x7FFF (1 in s.15)
+    vmudh   $v29, vOne, $v31[7] // Load accum mid with 0x7FFF (1 in s.15)
     vmadm   vCCC, vPairRGBA, $v30[7] // + (alpha - 1) * aoDir factor; elems 3, 7
     luv     vAAA,     (ltBufOfs + 0)(ambLight)  // Ambient light level
     vmudh   $v29, vOne, $v31[7] // Load accum mid with 0x7FFF (1 in s.15)
     vmadm   vPairRGBA, vPairRGBA, $v30[6] // + (alpha - 1) * aoAmb factor; elems 3, 7
     vmulf   $v29, vPairLt, vCCC[3h] // Sum of dir lights *= dir factor
-    vmacf   vPairLt, vAAA, vPairRGBA[3h] // + ambient * amb factor
-lt_skip_ao:
-vLtRGBOut equ vPairRGBA
-vLtAOut   equ vAAA
-    vmulf   vLtRGBOut, vDDD, vPairLt       // RGB output is RGB * light
-    beqz    TODO, lt_skip_cel
-     vcopy  vLtAOut, vDDD                  // Alpha output = vertex alpha (only 3, 7 matter)
-    // Cel: alpha = max of light components, RGB = vertex color
+    beqz    lbL2A, ltbasic_no_l2a
+     vmacf  vPairLt, vAAA, vPairRGBA[3h] // + ambient * amb factor
+ltbasic_l2a:
+    // Light-to-alpha (cel shading): alpha = max of light components, RGB = vertex color
     vge     vLtAOut, vPairLt, vPairLt[1h]  // elem 0 = max(R0, G0); elem 4 = max(R1, G1)
     vge     vLtAOut, vLtAOut, vLtAOut[2h]  // elem 0 = max(R0, G0, B0); equiv for elem 4
     vcopy   vLtRGBOut, vDDD                // RGB output is vertex color
-    vmudh   vLtAOut, vOne, vLtAOut[0h]     // move light level elem 0, 4 to 3, 7
-lt_skip_cel:
     vne     $v29, $v31, $v31[3h]           // Set VCC to 11101110
-    bnez    TODO, lt_skip_novtxcolor
-     lpv    ltLookAt[0], (xfrmLookatDirs + 0)($zero) // Lookat 0 in 0-2, 1 in 4-6; = vNrmOut
+    j       ltbasic_common_end
+     vmudh  vLtAOut, vOne, vLtAOut[0h]     // move light level elem 0, 4 to 3, 7
+    
+ltbasic_no_l2a:
+    vcopy   vLtAOut, vDDD                  // Alpha output (only 3, 7 matter) = vertex alpha 
+    vmulf   vLtRGBOut, vDDD, vPairLt       // RGB output is RGB * light
+ltbasic_common_end:
+    bnez    lbPacked, @@skip               // If packed normals, skip
+     andi   $11, vGeomMid, G_TEXTURE_GEN >> 8
     vcopy   vLtRGBOut, vPairLt             // If no packed normals, base output is just light
-lt_skip_novtxcolor:
-    // vnop
-    beqz    TODO, vtx_return_from_lighting
+@@skip:
+    beqz    $11, vtx_return_from_lighting
      vmrg   vPairRGBA, vLtRGBOut, vLtAOut  // Merge base output and alpha output
-
-
-    
+ltbasic_texgen:
 // Texgen uses vLookat0:1 = vPairLt and vAAA, vCCC:vDDD, and of course vPairST.
+    lpv     ltLookAt[0], (xfrmLookatDirs + 0)($zero) // Lookat 0 in 0-2, 1 in 4-6; = vNrmOut
     vmulf   $v29, vPairNrml, ltLookAt[0] // Normals X elems 0, 4 * lookat 0 X
     vmacf   $v29, $v14, ltLookAt[1]      // Normals Y elems 0, 4 * lookat 0 Y
     vmacf   vLookat0, $v15, ltLookAt[2]  // Normals Z elems 0, 4 * lookat 0 Z
@@ -3820,10 +3379,53 @@ lt_skip_novtxcolor:
     // Continue to rest of texgen shared by both versions.
     TODO now the results are in 3h instead of 0h
 
-.endif // CFG_LEGACY_VTX_PIPE
-    
-    
-.if !CFG_LEGACY_VTX_PIPE
+ovl2_end:
+.align 8
+ovl2_padded_end:
+
+.headersize ovl234_start - orga()
+
+ovl4_start:
+// Advanced lighting overlay.
+
+// Jump here for basic lighting setup. If overlay 4 is loaded (this code), loads overlay 2
+// and jumps to right here, which is now in the new code.
+ovl234_ltbasic_entrypoint_ovl4ver:         // same IMEM address as ovl234_ltbasic_entrypoint
+.if CFG_PROFILING_B
+    addi    perfCounterC, perfCounterC, 1  // Count lighting overlay load
+.endif
+    jal     load_overlays_2_3_4            // Not a call; returns to $ra-8 = here
+     li     cmd_w1_dram, orga(ovl2_start)  // set up a load for overlay 2
+     
+// Jump here for advanced lighting. If overlay 4 is loaded (this code), jumps
+// to the instruction selection below.
+ovl234_ltadv_entrypoint:
+.if CFG_PROFILING_B
+    nop                                    // Needs to take up the space for the other perf counter
+.endif
+    j       ltadv_continue
+     nop    // TODO
+
+// Jump here for clipping and rare commands. If overlay 4 is loaded (this code), loads overlay 3
+// and jumps to right here, which is now in the new code.
+ovl234_clipmisc_entrypoint_ovl4ver:        // same IMEM address as ovl234_clipmisc_entrypoint
+    sh      $ra, tempTriRA                 // Tri return after clipping
+.if CFG_PROFILING_B
+    addi    perfCounterD, perfCounterD, 0x4000  // Count clipping overlay load
+.endif
+    jal     load_overlays_2_3_4            // Not a call; returns to $ra-8 = here
+     li     cmd_w1_dram, orga(ovl3_start)  // set up a load for overlay 3
+
+ltadv_continue:
+    // TODO
+    nop
+    nop
+    nop
+    j vtx_after_lt_setup
+     nop
+
+/*
+
 //
 // F3DEX3 native lighting
 //
@@ -4056,14 +3658,14 @@ lt_skip_fresnel:
     
 .if !CFG_LEGACY_VTX_PIPE
 lt_point:
-    /*
+    comment
     Input vector 1 elem size 7FFF.0000 -> len^2 3FFF0001 -> 1/len 0001.0040 -> vec +801E.FFC0 -> clamped 7FFF
         len^2 * 1/len = 400E.FFC1 so about half actual length
     Input vector 1 elem size 0100.0000 -> len^2 00010000 -> 1/len 007F.FFC0 -> vec  7FFF.C000 -> clamped 7FFF
         len^2 * 1/len = 007F.FFC0 so about half actual length
     Input vector 1 elem size 0010.0000 -> len^2 00000100 -> 1/len 07FF.FC00 -> vec  7FFF.C000
     Input vector 1 elem size 0001.0000 -> len^2 00000001 -> 1/len 7FFF.C000 -> vec  7FFF.C000
-    */
+    
     ldv     vAAA[0], (ltBufOfs + 8 - lightSize)(curLight) // Light position int part 0-3
     ldv     vAAA[8], (ltBufOfs + 8 - lightSize)(curLight) // 4-7
 lt_normal_to_vertex:
@@ -4146,219 +3748,8 @@ lt_normalize:
     vmadm   vBBB, vAAA, $v29[1h] // Vec int * frac scaling, discard result
     jr      $ra
      vmadh  vNrmOut, vAAA, $v29[0h] // Vec int * int scaling
-.endif
-
-ovl2_end:
-.align 8
-ovl2_padded_end:
-
-.headersize ovl234_start - orga()
-
-ovl4_start:
-
-.if !CFG_LEGACY_VTX_PIPE
-
-// Contains M inverse transpose (mIT) computation, and some rarely-used command handlers.
-
-// Jump here to do lighting. If overlay 4 is loaded (this code), loads overlay 2
-// and jumps to right here, which is now in the new code.
-ovl234_lighting_entrypoint_ovl4ver:        // same IMEM address as ovl234_lighting_entrypoint
-.if CFG_PROFILING_B
-    addi    perfCounterC, perfCounterC, 1  // Count lighting overlay load
-.endif
-    jal     load_overlays_2_3_4            // Not a call; returns to $ra-8 = here
-     li     cmd_w1_dram, orga(ovl2_start)  // set up a load for overlay 2
-     
-// Jump here for all overlay 4 features. If overlay 4 is loaded (this code), jumps
-// to the instruction selection below.
-ovl234_ovl4_entrypoint:
-.if !CFG_NO_OCCLUSION_PLANE
-G_MTX_end:
-.endif
-.if CFG_PROFILING_B
-    nop                                    // Needs to take up the space for the other perf counter
-.endif
-    j       ovl4_select_instr
-     lw     cmd_w1_dram, (inputBufferEnd - 4)(inputBufferPos) // Overwritten by overlay load
 
-// Jump here to do clipping. If overlay 4 is loaded (this code), loads overlay 3
-// and jumps to right here, which is now in the new code.
-ovl234_clipping_entrypoint_ovl4ver:        // same IMEM address as ovl234_clipping_entrypoint
-    sh      $ra, tempTriRA                 // Tri return after clipping
-.if CFG_PROFILING_B
-    addi    perfCounterD, perfCounterD, 0x4000  // Count clipping overlay load
-.endif
-    jal     load_overlays_2_3_4            // Not a call; returns to $ra-8 = here
-     li     cmd_w1_dram, orga(ovl3_start)  // set up a load for overlay 3
-
-ovl4_select_instr:
-.if !CFG_NO_OCCLUSION_PLANE
-    li      $2, (0xFF00 | G_MTX)
-    beq     $2, $7, g_mtx_end_ovl4
-.endif
-     li     $3, G_BRANCH_WZ
-    beq     $3, $7, g_branch_wz_ovl4
-     li     $2, (0xFF00 | G_DMA_IO)
-    beq     $2, $7, g_dma_io_ovl4
-     li     $3, (0xFF00 | G_MEMSET)
-    beq     $3, $7, g_memset_ovl4
-     // Otherwise calc_mit. Delay slot is harmless.
-
-calc_mit:
-    /*
-    Compute M inverse transpose. All regs available except vM0I::vM3F, $v30 (fxParams),
-    and $v31 constants.
-    Register use (all only elems 0-2):
-    $v8:$v9   X left rotated int:frac, $v10:$v11 X right rotated int:frac
-    $v12:$v13 Y left rotated int:frac, $v14:$v15 Y right rotated int:frac
-    $v16:$v17 Z left rotated int:frac, $v18:$v19 Z right rotated int:frac
-    Rest temps.
-    Scale factor can be arbitrary, but final matrix must only reduce a vector's
-    magnitude (rotation * scale < 1). So want components of matrix to be < 0001.0000.
-    However, if input matrix has components on the order of 0000.0100, multiplying
-    two terms will reduce that to the order of 0000.0001, which kills all the precision.
-    */
-    // Get absolute value of all terms of M matrix.
-    li      $10, mMatrix + 0xE                               // For right rotates with lrv/ldv
-    vxor    $v20, vM0I, $v31[1] // One's complement of X int part
-    sb      $7, mITValid                                     // $7 is 1 if we got here, mark valid
-    vlt     $v29, vM0I, $v31[2] // X int part < 0
-    li      $11, mMatrix + 2                                 // For left rotates with lqv/ldv
-    vabs    $v21, vM0I, vM0F    // Apply sign of X int part to X frac part
-    lrv     $v10[0], (0x00)($10)                              // X int right shifted
-    vxor    $v22, vM1I, $v31[1] // One's complement of Y int part
-    lrv     $v11[0], (0x20)($10)                              // X frac right shifted
-    vmrg    $v20, $v20, vM0I    // $v20:$v21 = abs(X int:frac)
-    lqv     $v16[0], (0x10)($11)                              // Z int left shifted
-    vlt     $v29, vM1I, $v31[2] // Y int part < 0
-    lqv     $v17[0], (0x30)($11)                              // Z frac left shifted
-    vabs    $v23, vM1I, vM1F    // Apply sign of Y int part to Y frac part
-    lsv     $v10[0], (0x02)($11)                              // X int right rot elem 2->0
-    vxor    $v24, vM2I, $v31[1] // One's complement of Z int part
-    lsv     $v11[0], (0x22)($11)                              // X frac right rot elem 2->0
-    vmrg    $v22, $v22, vM1I    // $v22:$v23 = abs(Y int:frac)
-    lsv     $v16[4],  (0x0E)($11)                             // Z int left rot elem 0->2
-    vlt     $v29, vM2I, $v31[2] // Z int part < 0
-    lsv     $v17[4],  (0x2E)($11)                             // Z frac left rot elem 0->2
-    vabs    $v25, vM2I, vM2F    // Apply sign of Z int part to Z frac part
-    lrv     $v18[0], (0x10)($10)                              // Z int right shifted
-    vmrg    $v24, $v24, vM2I    // $v24:$v25 = abs(Z int:frac)
-    lrv     $v19[0], (0x30)($10)                              // Z frac right shifted
-    // See if any of the int parts are nonzero. Also, get the maximum of the frac parts.
-    vge     $v21, $v21, $v23
-    lqv     $v8[0],  (0x00)($11)                              // X int left shifted
-    vor     $v20, $v20, $v22
-    lqv     $v9[0],  (0x20)($11)                              // X frac left shifted
-    vmudn   $v11, $v11, $v31[1] // -1; negate X right rot
-    lsv     $v18[0], (0x12)($11)                              // Z int right rot elem 2->0
-    vmadh   $v10, $v10, $v31[1]
-    lsv     $v19[0], (0x32)($11)                              // Z frac right rot elem 2->0
-    vge     $v21, $v21, $v25
-    lsv     $v8[4],  (-0x02)($11)                             // X int left rot elem 0->2
-    vor     $v20, $v20, $v24
-    lsv     $v9[4],  (0x1E)($11)                              // X frac left rot elem 0->2
-    vmudn   $v17, $v17, $v31[1] // -1; negate Z left rot
-    ldv     $v12[0], (0x08)($11)                              // Y int left shifted
-    vmadh   $v16, $v16, $v31[1]
-    ldv     $v13[0], (0x28)($11)                              // Y frac left shifted
-    vge     $v21, $v21, $v21[1h]
-    ldv     $v14[0], (-0x08)($10)                             // Y int right shifted
-    vor     $v20, $v20, $v20[1h]
-    ldv     $v15[0], (0x18)($10)                              // Y frac right shifted
-    vmudn   $v27, $v19, $v31[1] // -1; $v26:$v27 is negated copy of Z right rot
-    lsv     $v12[4], (0x06)($11)                              // Y int left rot elem 0->2
-    vmadh   $v26, $v18, $v31[1]
-    lsv     $v13[4], (0x26)($11)                              // Y frac left rot elem 0->2
-    vge     $v21, $v21, $v21[2h]
-    lsv     $v14[0], (0x0A)($11)                              // Y int right rot elem 2->0
-    vor     $v20, $v20, $v20[2h]
-    lsv     $v15[0], (0x2A)($11)                              // Y frac right rot elem 2->0
-    // Scale factor is 1/(2*(max^2)) (clamped if overflows).
-    // 1/(2*max) is what vrcp provides, so we multiply that by 2 and then by the rcp
-    // output. If we used the scale factor of 1/(max^2), the output matrix would have
-    // components on the order of 0001.0000, but we want the components to be smaller than this.
-    vrcp    $v25[1], $v21[0] // low in, low out (discarded)
-    vrcph   $v25[0], $v31[2] // zero in, high out (only care about elem 0)
-    vadd    $v22, $v25, $v25 // *2
-    vmudh   $v25, $v22, $v25 // (1/max) * (1/(2*max)), clamp to 0x7FFF
-    veq     $v29, $v20, $v31[2] // elem 0 (all int parts) == 0
-    vmrg    $v25, $v25, vOne // If so, use computed normalization, else use 1 (elem 0)
-    /*
-    The original equations for the matrix rows are (XL = X rotated left, etc., n = normalization):
-    n*(YL*ZR - YR*ZL)
-    n*(ZL*XR - ZR*XL)
-    n*(XL*YR - XR*YL)
-    We need to apply the normalization to one of each of the terms before the multiply,
-    and also there's no multiply-subtract instruction, only multiply-add. Converted to:
-    (n*YL)*  ZR  + (n*  YR )*(-ZL)
-    (n*XL)*(-ZR) + (n*(-XR))*(-ZL)
-    (n*XL)*  YR  + (n*(-XR))*  YL
-    So the steps are:
-    Negate XR, negate ZL, negated copy of ZR (all done above)
-    Scale XL, scale negated XR
-    Do multiply-adds for Y and Z output vectors
-    Scale YL, scale YR
-    Do multiply-adds for X output vector
-    */
-    vmudn   $v9,  $v9,  $v25[0] // Scale XL
-    vmadh   $v8,  $v8,  $v25[0]
-    vmudn   $v11, $v11, $v25[0] // Scale XR
-    vmadh   $v10, $v10, $v25[0]
-    // Z output vector: XL*YR + XR*YL, with each term having had scale and/or negative applied
-    vmudl   $v29, $v9,  $v15
-    vmadm   $v29, $v8,  $v15
-    vmadn   $v29, $v9,  $v14
-    vmadh   $v29, $v8,  $v14
-    vmadl   $v29, $v11, $v13
-    vmadm   $v29, $v10, $v13
-    vmadn   $v21, $v11, $v12
-    vmadh   $v20, $v10, $v12 // $v20:$v21 = Z output
-    vmudn   $v13, $v13, $v25[0] // Scale YL
-    vmadh   $v12, $v12, $v25[0]
-    vmudn   $v15, $v15, $v25[0] // Scale YR
-    vmadh   $v14, $v14, $v25[0]
-    // Y output vector: XL*ZR + XR*ZL, with each term having had scale and/or negative applied
-    vmudl   $v29, $v9,  $v27 // Negated copy of ZR
-    vmadm   $v29, $v8,  $v27
-    vmadn   $v29, $v9,  $v26
-    vmadh   $v29, $v8,  $v26
-    sdv     $v21[0], (mITMatrix + 0x28)($zero)
-    vmadl   $v29, $v11, $v17
-    sdv     $v20[0], (mITMatrix + 0x10)($zero)
-    vmadm   $v29, $v10, $v17
-    vmadn   $v21, $v11, $v16
-    vmadh   $v20, $v10, $v16 // $v20:$v21 = Y output
-    // X output vector: YL*ZR + YR*ZL, with each term having had scale and/or negative applied
-    vmudl   $v29, $v13, $v19
-    vmadm   $v29, $v12, $v19
-    vmadn   $v29, $v13, $v18
-    vmadh   $v29, $v12, $v18
-    sdv     $v21[0], (mITMatrix + 0x20)($zero)
-    vmadl   $v29, $v15, $v17
-    sdv     $v20[0], (mITMatrix + 0x08)($zero)
-    vmadm   $v29, $v14, $v17
-    vmadn   $v21, $v15, $v16
-    vmadh   $v20, $v14, $v16 // $v20:$v21 = X output
-    sdv     $v21[0], (mITMatrix + 0x18)($zero)
-    j       vtx_after_calc_mit
-     sdv    $v20[0], (mITMatrix + 0x00)($zero)
-
-.if !CFG_NO_OCCLUSION_PLANE
-g_mtx_end_ovl4:
-    instantiate_mtx_end_begin
-    instantiate_mtx_multiply
-.endif
-
-g_branch_wz_ovl4:
-    instantiate_branch_wz
-
-g_dma_io_ovl4:
-    instantiate_dma_io
-    
-g_memset_ovl4:
-    instantiate_memset
-
-.endif // !CFG_LEGACY_VTX_PIPE
+*/
 
 ovl4_end:
 .align 8
diff --git a/gbi.h b/gbi.h
index 44c67d9..ad794e3 100644
--- a/gbi.h
+++ b/gbi.h
@@ -140,8 +140,8 @@ of warnings if you use -Wpedantic. */
 #define G_ZBUFFER               0x00000001
 #define G_TEXTURE_ENABLE        0x00000000  /* actually 2, but controlled by SPTexture */
 #define G_SHADE                 0x00000004
-#define G_AMBOCCLUSION          0x00000040
-#define G_ATTROFFSET_ST_ENABLE  0x00000100
+#define G_ATTROFFSET_ST_ENABLE  0x00000080
+#define G_AMBOCCLUSION          0x00000100
 #define G_CULL_NEITHER          0x00000000
 #define G_CULL_FRONT            0x00000200
 #define G_CULL_BACK             0x00000400
@@ -157,7 +157,7 @@ of warnings if you use -Wpedantic. */
 #define G_TEXTURE_GEN_LINEAR    0x00080000
 #define G_LOD                   0x00100000  /* Ignored by all F3DEX* variants */
 #define G_SHADING_SMOOTH        0x00200000
-#define G_LIGHTING_POSITIONAL   0x00400000  /* Ignored by F3DEX3, assumed always on */
+#define G_LIGHTING_POSITIONAL   0x00400000  /* In F3DEX3, replaced by ENABLE_POINT_LIGHTS */
 #define G_CLIPPING              0x00800000  /* Ignored by all F3DEX* variants */
 
 /* See SPDisplayList / SPBranchList */
@@ -3237,8 +3237,15 @@ _DW({                                               \
  * Lighting Commands
  */
 
+/**
+ * OR this flag into n in SPNumLights or SPSetLights* to indicate that one or
+ * more of the lights are point lights.
+ * Example: gSPSetLights(POLY_OPA_DISP++, numLights | ENABLE_POINT_LIGHTS, *lights);
+ */
+#define ENABLE_POINT_LIGHTS (0x8000 >> 4)
+
 #define NUML(n)    ((n) * 0x10)
-/*
+/**
  * F3DEX3 properly supports zero lights, so there is no need to use these macros
  * anymore.
  */
@@ -3265,7 +3272,7 @@ _DW({                                               \
 #define gsSPNumLights(n)                                \
     gsMoveWd(    G_MW_NUMLIGHT, G_MWO_NUMLIGHT, NUML(n))
 
-/* There is also no need to use these macros. */
+/** There is also no need to use these macros. */
 #define LIGHT_1     1
 #define LIGHT_2     2
 #define LIGHT_3     3
diff --git a/rsp/gbi.inc b/rsp/gbi.inc
index 787af29..c1e5b7e 100644
--- a/rsp/gbi.inc
+++ b/rsp/gbi.inc
@@ -7,8 +7,8 @@
 G_ZBUFFER               equ 0x00000001
 //G_TEXTURE_ENABLE      equ 0x00000002
 G_SHADE                 equ 0x00000004
-G_AMBOCCLUSION          equ 0x00000040
-G_ATTROFFSET_ST_ENABLE  equ 0x00000100
+G_ATTROFFSET_ST_ENABLE  equ 0x00000080
+G_AMBOCCLUSION          equ 0x00000100
 // Bits 9 and 10 (0x0600) determine front/backface culling.
 G_CULL_NEITHER          equ 0x00000000
 G_CULL_FRONT            equ 0x00000200