-
Notifications
You must be signed in to change notification settings - Fork 7
/
f3dex3.s
3956 lines (3716 loc) · 182 KB
/
f3dex3.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
.rsp
.include "rsp/rsp_defs.inc"
.include "rsp/gbi.inc"
// This file assumes DATA_FILE and CODE_FILE are set on the command line
.if version() < 110
.error "armips 0.11 or newer is required"
.endif
// Load immediate: sets reg to imm with a single addi from $zero, so imm has
// to fit in addi's immediate field.
// NOTE(review): presumably this shadows the assembler's own li pseudo-op so
// that every use is exactly one instruction -- confirm.
.macro li, reg, imm
addi reg, $zero, imm
.endmacro
// Register-to-register copy, written as an ori of src with a zero immediate.
.macro move, dst, src
ori dst, src, 0
.endmacro
// Prohibit macros involving slt; this silently clobbers $1. You can of course
// manually write the slt and branch instructions if you want this behavior.
// Each definition below turns any use of the named branch into an
// assembly-time error, so it fails the build instead of assembling.
// The three parameters are accepted only so the usual operand list parses.
.macro blt, ra, rb, lbl
.error "blt is a macro using slt, and silently clobbers $1!"
.endmacro
.macro bgt, ra, rb, lbl
.error "bgt is a macro using slt, and silently clobbers $1!"
.endmacro
.macro ble, ra, rb, lbl
.error "ble is a macro using slt, and silently clobbers $1!"
.endmacro
.macro bge, ra, rb, lbl
.error "bge is a macro using slt, and silently clobbers $1!"
.endmacro
// This version doesn't depend on $v0 to be vZero, which it usually is not in
// F3DEX3, and also doesn't get corrupted if $vco is set / consume $vco which
// may be needed for a subsequent instruction.
// Vector copy: dst = src, computed as src OR src.
.macro vcopy, dst, src
vor dst, src, src
.endmacro
// Using $v31 instead of dst as the source because $v31 doesn't change, whereas
// dst might have been modified 2 or 3 cycles ago, causing a stall.
// Vector clear: dst = 0, computed as $v31 XOR $v31.
.macro vclr, dst
vxor dst, $v31, $v31
.endmacro
// Also using $v31 for the dummy args here to avoid stalls. dst was once written
// in vanilla tri code just before reading (should have been $v29), leading to
// stalls!
// Selector values intended for the N argument of vreadacc below.
ACC_UPPER equ 0
ACC_MIDDLE equ 1
ACC_LOWER equ 2
// Reads the accumulator into dst with vsar; N is passed through as the element
// selector on the last operand and picks which part is read.
// NOTE(review): presumably N is always one of the ACC_* values above -- confirm
// against the callers.
.macro vreadacc, dst, N
vsar dst, $v31, $v31[N]
.endmacro
//
// Profiling configurations. To make space for the profiling features, if any of
// the profiling configurations are enabled, G_LIGHTTORDP and !G_SHADING_SMOOTH
// are removed, i.e. G_LIGHTTORDP behaves as a no-op and all tris are smooth
// shaded.
//
// Exactly one branch of the .if chain below is taken, and every branch defines
// the same four symbols:
//   ENABLE_PROFILING             1 in configurations A, B and C; 0 in the default
//   COUNTER_A_UPPER_VERTEX_COUNT 1 where perfCounterA's upper 16 bits are the vertex count
//   COUNTER_B_LOWER_CMD_COUNT    1 where perfCounterB's lower 16 bits are the DL command count
//   COUNTER_C_FIFO_FULL          1 where perfCounterC counts RDP-FIFO-full stall cycles
// The nested .if checks reject builds that enable more than one configuration.
//
// Profiling Configuration A
// perfCounterA:
// cycles RSP spent processing vertex commands (incl. vertex DMAs)
// perfCounterB:
// upper 16 bits: fetched DL command count
// lower 16 bits: DL command count
// perfCounterC:
// cycles RSP was stalled because RDP FIFO was full
// perfCounterD:
// cycles RSP spent processing triangle commands, NOT including buffer flushes
.if CFG_PROFILING_A
.if CFG_PROFILING_B || CFG_PROFILING_C
.error "At most one CFG_PROFILING_ option can be enabled at a time"
.endif
ENABLE_PROFILING equ 1
COUNTER_A_UPPER_VERTEX_COUNT equ 0
COUNTER_B_LOWER_CMD_COUNT equ 1
COUNTER_C_FIFO_FULL equ 1
// Profiling Configuration B
// perfCounterA:
// upper 16 bits: vertex count
// lower 16 bits: lit vertex count
// perfCounterB:
// upper 18 bits: tris culled by occlusion plane count
// lower 14 bits: clipped (input) tris count
// perfCounterC:
// upper 18 bits: overlay (all 0-4) load count
// lower 14 bits: overlay 2 (lighting) load count
// perfCounterD:
// upper 18 bits: overlay 3 (clipping) load count
// lower 14 bits: overlay 4 (misc) load count
.elseif CFG_PROFILING_B
// A is already known to be off here, so only C needs to be excluded.
.if CFG_PROFILING_C
.error "At most one CFG_PROFILING_ option can be enabled at a time"
.endif
ENABLE_PROFILING equ 1
COUNTER_A_UPPER_VERTEX_COUNT equ 1
COUNTER_B_LOWER_CMD_COUNT equ 0
COUNTER_C_FIFO_FULL equ 0
// Profiling Configuration C
// perfCounterA:
// cycles RSP believes it was running (this ucode only)
// perfCounterB:
// upper 16 bits: samples GCLK was alive (sampled once per DL command count)
// lower 16 bits: DL command count
// perfCounterC:
// upper 18 bits: small RDP command count (all RDP cmds except tris)
// lower 14 bits: matrix loads count
// perfCounterD:
// cycles RSP was stalled waiting for miscellaneous DMAs to finish
.elseif CFG_PROFILING_C
// A and B are both known to be off on this branch, so no exclusivity check is needed.
ENABLE_PROFILING equ 1
COUNTER_A_UPPER_VERTEX_COUNT equ 0
COUNTER_B_LOWER_CMD_COUNT equ 1
COUNTER_C_FIFO_FULL equ 0
// Default (extra profiling disabled)
// perfCounterA:
// upper 16 bits: vertex count
// lower 16 bits: RDP/out tri count
// perfCounterB:
// upper 18 bits: RSP/in tri count
// lower 14 bits: tex/fill rect count
// perfCounterC:
// cycles RSP was stalled because RDP FIFO was full
// perfCounterD:
// unused/zero
.else
ENABLE_PROFILING equ 0
COUNTER_A_UPPER_VERTEX_COUNT equ 1
COUNTER_B_LOWER_CMD_COUNT equ 0
COUNTER_C_FIFO_FULL equ 1
.endif
/*
There are two different memory spaces for the overlays: (a) IMEM and (b) the
microcode file (which, plus an offset, is also the location in DRAM).
A label marks both an IMEM address and a file address, but evaluating the
label in an integer context (e.g. in a branch) gives the IMEM address.
`orga(your_label)` gets the file address of the label, and `.orga` sets the
file address.
`.headersize`, as well as the value after `.create`, sets the difference
between IMEM addresses and file addresses, so you can set the IMEM address
with `.headersize desired_imem_addr - orga()`.
In IMEM, the whole microcode is organized as (each row is the same address):
0x80 space | |
for boot code Overlay 0 Overlay 1
(End (More cmd
start task) handlers)
(initialization) | |
Many command
handlers
Overlay 2 Overlay 3 Overlay 4
(Lighting) (Clipping) (mIT, rare cmds)
Vertex and
tri handlers
DMA code
In the file, the microcode is organized as:
start (file addr 0x0 = IMEM 0x1080)
Many command handlers
Overlay 3
Vertex and tri handlers
DMA code (end of this = IMEM 0x2000 = file 0xF80)
Overlay 0
Overlay 1
Overlay 2
Overlay 4
*/
////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////// DMEM //////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// RSP DMEM
.create DATA_FILE, 0x0000
/*
Matrices are stored and used in a transposed format compared to how they are
normally written in mathematics. For the integer part:
00 02 04 06 typical Xscl Rot Rot 0
08 0A 0C 0E use: Rot Yscl Rot 0
10 12 14 16 Rot Rot Zscl 0
18 1A 1C 1E Xpos Ypos Zpos 1
The fractional part comes next and is in the same format.
Applying this transformation is done by multiplying a row vector times the
matrix, like:
X Y Z 1 * Xscl Rot Rot 0 = NewX NewY NewZ 1
Rot Yscl Rot 0
Rot Rot Zscl 0
Xpos Ypos Zpos 1
In C, the matrix is accessed as matrix[row][col], and the vector is vector[row].
*/
// 0x0000-0x0040: model matrix
mMatrix:
.fill 64
// 0x0040-0x0080: view * projection matrix
vpMatrix:
.fill 64
// model inverse transpose matrix; first three rows only
mITMatrix:
.fill 0x30
fogFactor:
.dw 0x00000000
textureSettings1:
.dw 0x00000000 // first word, has command byte, level, tile, and on
textureSettings2:
.dw 0xFFFFFFFF // second word, has s and t scale
geometryModeLabel:
.dw 0x00000000 // originally initialized to G_CLIPPING, but that does nothing
.if . != 0x00C0
.error "Scissor and othermode must be at 0x00C0 for S2DEX"
.endif
// scissor (four 12-bit values)
scissorUpLeft: // the command byte is included since the command word is copied verbatim
.dw (G_SETSCISSOR << 24) | (( 0 * 4) << 12) | (( 0 * 4) << 0)
scissorBottomRight:
.dw ((320 * 4) << 12) | ((240 * 4) << 0)
// othermode
otherMode0: // command byte included, same as above
.dw (G_RDPSETOTHERMODE << 24) | (0x080CFF)
otherMode1:
.dw 0x00000000
// Saved texrect state for combining the multiple input commands into one RDP texrect command
texrectWord1:
.fill 4 // first word, has command byte, xh and yh
texrectWord2:
.fill 4 // second word, has tile, xl, yl
// First half of RDP value for split commands; overwritten by numLightsxSize
rdpHalf1Val:
.fill 4
dirLightsXfrmValid:
.db 0
numLightsxSize:
.db 0 // Overwrites rdpHalf1Val when written
// displaylist stack length
displayListStackLength:
.db 0x00 // starts at 0, increments by 4 for each "return address" pushed onto the stack
// Is M inverse transpose valid or does it need to be recomputed. Zeroed when modifying M.
mITValid:
.db 0
// viewport
viewport:
.fill 16
// Current RDP fifo output position
rdpFifoPos:
.fill 4
matrixStackPtr:
.dw 0x00000000
// segment table
segmentTable:
.fill (4 * 16) // 16 DRAM pointers
// displaylist stack
displayListStack:
// ucode text (shared with DL stack)
.ascii ID_STR, 0x0A
endIdStr:
.if endIdStr < 0x180
.fill (0x180 - endIdStr)
.elseif endIdStr > 0x180
.error "ID_STR is too long"
.align 16 // to suppress subsequent errors
.endif
endSharedDMEM:
.if . != 0x180
.error "endSharedDMEM at incorrect address, matters for G_LOAD_UCODE / S2DEX"
.endif
// constants for register $v31
.if (. & 15) != 0
.error "Wrong alignment for v31value"
.endif
v31Value:
// v31 must go from lowest to highest (signed) values for vcc patterns.
// Also relies on the fact that $v31[0h] is -4,-4,-4,-4, 4, 4, 4, 4.
.dh -4 // used in clipping, vtx write for Newton-Raphson reciprocal
.dh -1 // used often
.dh 0 // used often
.dh 2 // used as clip ratio (vtx write, clipping) and in clipping
.dh 4 // used for same Newton-Raphsons, occlusion plane scaling
.dh 0x4000 // used in tri write, texgen
.dh 0x7F00 // used in fog, normals unpacking
.dh 0x7FFF // used often
// constants for register $v30
// The eight halfwords below are addressed through the v30_* aliases defined
// after the table.
.if (. & 15) != 0
.error "Wrong alignment for v30value"
.endif
v30Value:
decalFixMult equ 0x0400
decalFixOff equ (-(decalFixMult / 2))
.dh vertexBuffer // currently 0x02DE; for converting vertex index to address
.dh vtxSize << 7 // 0x1300; it's not 0x2600 because vertex indices are *2
.dh 0x1000 // some multiplier in tri write, increment in vertex indices
.dh decalFixMult
.dh 0x0020 // some edge write thing in tri write; formerly Z scale factor
.dh 0xFFF8 // used once in tri write, mask away lower ST bits
.dh decalFixOff // negative
.dh 0x0100 // used several times in tri write
// Compares every element of $v30 against element 7 (0x0100) with vge, writing
// the vector result to $v29. With the table above, elements 0-3 and element 7
// are >= 0x0100, while elements 4-6 (0x0020, 0xFFF8 and decalFixOff) are not
// when read as signed halfwords, which gives the 11110001 pattern in the name.
.macro set_vcc_11110001 // Only VCC pattern used with $v30
vge $v29, $v30, $v30[7]
.endmacro
// Assembly-time guard: elements 0 and 3 must stay >= 0x0100 for the pattern
// above to hold.
.if (vertexBuffer < 0x0100 || decalFixMult < 0x100)
.error "VCC pattern for $v30 corrupted"
.endif
// Element aliases, in the same order as the .dh entries above.
v30_VB equ $v30[0] // Vertex Buffer
v30_VS equ $v30[1] // Vertex Size
v30_1000 equ $v30[2]
v30_DM equ $v30[3] // Decal Multiplier
v30_0020 equ $v30[4]
v30_FFF8 equ $v30[5]
v30_DO equ $v30[6] // Decal Offset
v30_0100 equ $v30[7]
/*
Quick note on Newton-Raphson:
https://en.wikipedia.org/wiki/Division_algorithm#Newton%E2%80%93Raphson_division
Given input D, we want to find the reciprocal R. The base formula for refining
the estimate of R is R_new = R*(2 - D*R). However, since the RSP reciprocal
instruction moves the radix point 1 to the left, the result has to be multiplied
by 2. So it's 2*R*(2 - D*2*R) = R*(4 - 4*D*R) = R*(1*4 + D*R*-4). This is where
the 4 and -4 come from. For tri write, the result needs to be multiplied by 4
for subpixels, so it's 16 and -16.
*/
cameraWorldPos:
.skip 6
tempTriRA:
.skip 2 // Overwritten as part of camera world position, used as temp
lightBufferLookat:
.skip 8 // s8 X0, Y0, Z0, dummy, X1, Y1, Z1, dummy
lightBufferMain:
.skip (G_MAX_LIGHTS * lightSize)
lightBufferAmbient:
.skip 8 // just colors for ambient light
ltBufOfs equ (lightBufferMain - altBase)
occlusionPlaneEdgeCoeffs:
/*
NOTE: This explanation is outdated; see cpu/occlusionplane.c
Vertex is in occlusion region if all five equations below are true:
4 * screenX[s13.2] * c0[s0.15] - 0.5 * screenY[s13.2] < c4[s14.1]
4 * screenY[s13.2] * c1[s0.15] - 0.5 * screenX[s13.2] < c5[s14.1]
4 * screenX[s13.2] * c2[s0.15] + 0.5 * screenY[s13.2] < c6[s14.1]
4 * screenY[s13.2] * c3[s0.15] + 0.5 * screenX[s13.2] < c7[s14.1]
clamp_to_0.s15(clipX[s15.16] * kx[0.s15])
+ clamp_to_0.s15(clipY[s15.16] * ky[0.s15])
+ clamp_to_0.s15(clipZ[s15.16] * kz[0.s15])
+ kc[0.s15]
>= 0
The first four can be rewritten as (again, vertex is occluded if all are true):
screenY > screenX * 8*c0 + -2*c4
screenX > screenY * 8*c1 + -2*c5
screenY < screenX * -8*c2 + 2*c6
screenX < screenY * -8*c3 + 2*c7
where screenX and screenY are in subpixels (e.g. screenX = 100 = 25.0 pixels),
c0-c3 are shorts representing -1:0.99997,
and c4-c7 are shorts representing "half pixels" (e.g. c4 = 50 = 25.0 pixels)
For the last equation, one option is to think of kx through kc as in s10.5 mode
instead, so a value of 0x0020 is 1.0 and they can range from -0x400.00 to
0x3FF.F8. This choice is because clipZ ranges from 0x0000.0000 at the camera
plane to 0x03FF.0000 at the maximum distance away. The normal distance Adult
Link is from the camera is about 0x00B0.0000.
A better option is to develop your plane equation in floating point, e.g.
clipX[f] * -0.2f + clipY[f] * 0.4f + clipZ[f] * 1.0f + -200.0f >= 0
then multiply everything by (32768.0f / max(abs(kx), abs(ky), abs(kz), abs(kc)))
(here 32768.0f / 200.0f = 163.84f)
clipX[f] * -32.77f + clipY[f] * 65.54f + clipZ[f] * 163.84f + -32768
*/
.dh 0x0000 // c0
.dh 0x0000 // c1
.dh 0x0000 // c2
.dh 0x0000 // c3
.dh 0x8000 // c4
.dh 0x8000 // c5
.dh 0x8000 // c6
.dh 0x8000 // c7
occlusionPlaneMidCoeffs:
.dh 0x0000 // kx
.dh 0x0000 // ky
.dh 0x0000 // kz
.dh 0x8000 // kc
// Alternate base address because vector load offsets can't reach all of DMEM.
// altBaseReg permanently points here.
altBase:
fxParams:
.if (. & 15) != 0
.error "Wrong alignment for fxParams"
.endif
// First 8 values here loaded with lqv.
aoAmbientFactor:
.dh 0xFFFF
aoDirectionalFactor:
.dh 0xA000
aoPointFactor:
.dh 0x0000
perspNorm:
.dh 0xFFFF
texgenLinearCoeffs:
.dh 0x44D3
.dh 0x6CB3
fresnelScale:
.dh 0x0000
fresnelOffset:
.dh 0x0000
attrOffsetST:
.dh 0x0100
.dh 0xFF00
alphaCompareCullMode:
.db 0x00 // 0 = disabled, 1 = cull if all < thresh, -1 = cull if all >= thresh
alphaCompareCullThresh:
.db 0x00 // Alpha threshold, 00 - FF
materialCullMode: // Overwritten to 0 by SPNormalsMode, but that should not
.db 0 // happen in the middle of tex setup
normalsMode:
.db 0 // Overwrites materialCullMode
lastMatDLPhyAddr:
.dw 0
activeClipPlanes:
.dh CLIP_SCAL_NPXY | CLIP_CAMPLANE // Normal tri write, set to zero when clipping
// Constants for clipping algorithm
clipCondShifts:
.db CLIP_SCAL_NY_SHIFT
.db CLIP_SCAL_PY_SHIFT
.db CLIP_SCAL_NX_SHIFT
.db CLIP_SCAL_PX_SHIFT
// Movemem table
movememTable:
.dh tempMatrix // G_MTX multiply temp matrix (model)
.dh mMatrix // G_MV_MMTX
.dh tempMatrix // G_MTX multiply temp matrix (projection)
.dh vpMatrix // G_MV_PMTX
.dh viewport // G_MV_VIEWPORT
.dh cameraWorldPos // G_MV_LIGHT
// moveword table
movewordTable:
.dh fxParams // G_MW_FX
.dh numLightsxSize - 3 // G_MW_NUMLIGHT
.dh 0 // unused
.dh segmentTable // G_MW_SEGMENT
.dh fogFactor // G_MW_FOG
.dh lightBufferMain // G_MW_LIGHTCOL
// Emits one halfword table entry holding the low 16 bits of addr.
.macro jumpTableEntry, addr
.dh addr & 0xFFFF
.endmacro
// G_POPMTX, G_MTX, G_MOVEMEM Command Jump Table
movememHandlerTable:
jumpTableEntry G_POPMTX_end // G_POPMTX
jumpTableEntry G_MTX_end // G_MTX (multiply)
jumpTableEntry G_MOVEMEM_end // G_MOVEMEM, G_MTX (load)
// Emits a one-byte table entry for the handler at addr: its offset from 0x1000
// divided by 4. Only addresses in 0x1000-0x13FF fit in a byte this way, so
// anything outside that range is rejected at assembly time.
.macro miniTableEntry, addr
.if addr < 0x1000 || addr >= 0x1400
.error "Handler address out of range!"
.endif
.db (addr - 0x1000) >> 2
.endmacro
// RDP/Immediate Command Mini Table
// 1 byte per entry, after << 2 points to an addr in first 1/4 of IMEM
miniTableEntry G_MEMSET_handler
miniTableEntry G_DMA_IO_handler
miniTableEntry G_TEXTURE_handler
miniTableEntry G_POPMTX_handler
miniTableEntry G_GEOMETRYMODE_handler
miniTableEntry G_MTX_handler
miniTableEntry G_MOVEWORD_handler
miniTableEntry G_MOVEMEM_handler
miniTableEntry G_LOAD_UCODE_handler
miniTableEntry G_DL_handler
miniTableEntry G_ENDDL_handler
miniTableEntry G_SPNOOP_handler
miniTableEntry G_RDPHALF_1_handler
miniTableEntry G_SETOTHERMODE_L_handler
miniTableEntry G_SETOTHERMODE_H_handler
miniTableEntry G_TEXRECT_handler
miniTableEntry G_TEXRECTFLIP_handler
miniTableEntry G_SYNC_handler // G_RDPLOADSYNC
miniTableEntry G_SYNC_handler // G_RDPPIPESYNC
miniTableEntry G_SYNC_handler // G_RDPTILESYNC
miniTableEntry G_SYNC_handler // G_RDPFULLSYNC
miniTableEntry G_RDP_handler // G_SETKEYGB
miniTableEntry G_RDP_handler // G_SETKEYR
miniTableEntry G_RDP_handler // G_SETCONVERT
miniTableEntry G_SETSCISSOR_handler
miniTableEntry G_RDP_handler // G_SETPRIMDEPTH
miniTableEntry G_RDPSETOTHERMODE_handler
miniTableEntry load_cmds_handler // G_LOADTLUT
miniTableEntry G_RDPHALF_2_handler
miniTableEntry G_RDP_handler // G_SETTILESIZE
miniTableEntry load_cmds_handler // G_LOADBLOCK
miniTableEntry load_cmds_handler // G_LOADTILE
miniTableEntry G_RDP_handler // G_SETTILE
miniTableEntry G_RDP_handler // G_FILLRECT
miniTableEntry G_RDP_handler // G_SETFILLCOLOR
miniTableEntry G_RDP_handler // G_SETFOGCOLOR
miniTableEntry G_RDP_handler // G_SETBLENDCOLOR
miniTableEntry G_RDP_handler // G_SETPRIMCOLOR
miniTableEntry G_RDP_handler // G_SETENVCOLOR
miniTableEntry G_RDP_handler // G_SETCOMBINE
miniTableEntry G_SETxIMG_handler // G_SETTIMG
miniTableEntry G_SETxIMG_handler // G_SETZIMG
miniTableEntry G_SETxIMG_handler // G_SETCIMG
cmdMiniTable:
miniTableEntry G_SYNC_handler // G_NOOP
miniTableEntry G_VTX_handler
miniTableEntry G_MODIFYVTX_handler
miniTableEntry G_CULLDL_handler
miniTableEntry G_BRANCH_WZ_handler
miniTableEntry G_TRI1_handler
miniTableEntry G_TRI2_handler
miniTableEntry G_QUAD_handler
miniTableEntry G_TRISTRIP_handler
miniTableEntry G_TRIFAN_handler
miniTableEntry G_LIGHTTORDP_handler
miniTableEntry G_RELSEGMENT_handler
// The maximum number of generated vertices in a clip polygon. In reality, this
// is equal to MAX_CLIP_POLY_VERTS, but for testing we can change them separately.
// In case you're wondering if it's possible to have a 7-vertex polygon where all
// 7 verts are generated, it looks like this (X = generated vertex):
// ___----=>
// +---------------__X----X _-^
// | __--^^ X^
// | __--^^ _-^|
// _X^^^ _-^ |
// C | _-^ |
// ^X _-^ |
// |\ _-^ |
// +-X--_X^---------------+
// V^
MAX_CLIP_GEN_VERTS equ 7
// Normally, each clip plane can cut off a "tip" of a polygon, turning one vert
// into two. (It can also cut off more of the polygon and remove additional verts,
// but the maximum is one more vert per clip plane.) So with 5 clip planes, we
// could have a maximum of 8 verts in the final polygon. However, the verts
// generated by the no-nearclipping plane will always be at infinity, so they
// will always get replaced by generated verts from one of the other clip planes.
// Put another way, if there are 8 verts in the final polygon, there are 8 edges,
// which are portions of the 3 original edges plus portions of 5 edges along the
// 5 clip planes. But the edge portion along the no-nearclipping plane is at
// infinity, so that edge can't be on screen.
// It is rare but possible for these assumptions to be violated and a polygon
// with more than 7 verts to be generated. For example, numerical precision
// issues could cause the polygon to be slightly non-convex at one of the clip
// planes, causing the plane to cut off more than one tip. However, this
// implementation checks for an imminent overflow and aborts clipping (draws no
// tris) if this occurs. Because this is caused by extreme/degenerate cases like
// the camera exactly on a tri, not drawing anything is an okay result.
MAX_CLIP_POLY_VERTS equ 7
// Derived sizes for the fixed-size buffers at the top of DMEM.
// armips expands equ by direct text replacement, so every expression here is
// fully parenthesized to keep its value no matter what expression it is pasted
// into (e.g. after a division or modulo operator).
// A clip polygon holds (MAX_CLIP_POLY_VERTS + 1) two-byte entries.
CLIP_POLY_SIZE_BYTES equ ((MAX_CLIP_POLY_VERTS+1) * 2)
CLIP_TEMP_VERTS_SIZE_BYTES equ (MAX_CLIP_GEN_VERTS * vtxSize)
VERTEX_BUFFER_SIZE_BYTES equ (G_MAX_VERTS * vtxSize)
RDP_CMD_BUFSIZE equ 0xB0
RDP_CMD_BUFSIZE_EXCESS equ 0xB0 // Maximum size of an RDP triangle command
RDP_CMD_BUFSIZE_TOTAL equ (RDP_CMD_BUFSIZE + RDP_CMD_BUFSIZE_EXCESS)
// The input buffer holds INPUT_BUFFER_CMDS entries of 8 bytes each.
INPUT_BUFFER_CMDS equ 21
INPUT_BUFFER_SIZE_BYTES equ (INPUT_BUFFER_CMDS * 8)
// 0xFC0 minus the sizes of the input buffer, both RDP command buffers, both
// clip polygons, the clip temp verts and the vertex buffer.
END_VARIABLE_LEN_DMEM equ (0xFC0 - INPUT_BUFFER_SIZE_BYTES - (2 * RDP_CMD_BUFSIZE_TOTAL) - (2 * CLIP_POLY_SIZE_BYTES) - CLIP_TEMP_VERTS_SIZE_BYTES - VERTEX_BUFFER_SIZE_BYTES)
startFreeDmem:
.org END_VARIABLE_LEN_DMEM
endFreeDmem:
// Main vertex buffer in RSP internal format
vertexBuffer:
.skip VERTEX_BUFFER_SIZE_BYTES
// Space for temporary verts for clipping code, and reused for other things
clipTempVerts:
// Round up to 0x10
.org ((clipTempVerts + 0xF) & 0xFF0)
// Vertex addresses, to avoid a multiply-add for each vertex index lookup
vertexTable:
.skip ((G_MAX_VERTS + 8) * 2) // halfword for each vertex; need 1 extra end addr, easier to write 8 extra
.if . > yieldDataFooter
// Need to fit everything through vertex buffer in yield buffer, would like
// to also fit vertexTable to avoid recompute after yield
.error "Too much being stored in yieldable DMEM"
.endif
tempMatrix:
.skip 0x40
.if . > (clipTempVerts + CLIP_TEMP_VERTS_SIZE_BYTES)
.error "Too much in clipTempVerts"
.endif
.org (clipTempVerts + CLIP_TEMP_VERTS_SIZE_BYTES)
clipTempVertsEnd:
clipPoly:
.skip CLIP_POLY_SIZE_BYTES // 3 5 7 + term 0
clipPoly2: // \ / \ / \
.skip CLIP_POLY_SIZE_BYTES // 4 6 7 + term 0
// First RDP Command Buffer
rdpCmdBuffer1:
.skip RDP_CMD_BUFSIZE
.if (. & 8) != 8
.error "RDP command buffer alignment to 8 assumption broken"
.endif
rdpCmdBuffer1End:
.skip 8
rdpCmdBuffer1EndPlus1Word:
// This is so that we can temporarily store vector regs here with lqv/sqv
.skip RDP_CMD_BUFSIZE_EXCESS - 8
// Second RDP Command Buffer
rdpCmdBuffer2:
.skip RDP_CMD_BUFSIZE
.if (. & 8) != 8
.error "RDP command buffer alignment to 8 assumption broken"
.endif
rdpCmdBuffer2End:
.skip 8
rdpCmdBuffer2EndPlus1Word:
.skip RDP_CMD_BUFSIZE_EXCESS - 8
// Input buffer. After RDP cmd buffers so it can be vector addressed from end.
inputBuffer:
.skip INPUT_BUFFER_SIZE_BYTES
inputBufferEnd:
inputBufferEndSgn equ -(0x1000 - inputBufferEnd) // Underflow DMEM address
.if . != 0xFC0
.error "DMEM organization incorrect"
.endif
.org 0xFC0
// 0x0FC0-0x1000: OSTask
OSTask:
.skip 0x40
// The only thing used in the first 16 bytes of OSTask is flags, which we now
// set up correctly (zero) when loading another ucode. This is a negative offset
// relative to $zero to wrap around DMEM to the top.
fourthQWMVP equ -(0x1000 - (OSTask + OSTask_type))
// This word is not used by F3DEX3, S2DEX, or even boot. Reuse it as a temp.
startCounterTime equ (OSTask + OSTask_ucode_size)
// These two words are used by boot, but not by F3DEX3 or S2DEX.
xfrmLookatDirs equ -(0x1000 - (OSTask + OSTask_ucode_data)) // and OSTask_ucode_data_size
memsetBufferStart equ ((vertexBuffer + 0xF) & 0xFF0)
memsetBufferMaxEnd equ (rdpCmdBuffer1 & 0xFF0)
memsetBufferMaxSize equ (memsetBufferMaxEnd - memsetBufferStart)
memsetBufferSize equ (memsetBufferMaxSize > 0x800 ? 0x800 : memsetBufferMaxSize)
.close // DATA_FILE
////////////////////////////////////////////////////////////////////////////////
/////////////////////////////// Register Naming ////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Vertex / lighting all regs:
vM0I equ $v0 // mMatrix rows int/frac
vM1I equ $v1 // Valid in vertex, lighting, and M inverse transpose
vM2I equ $v2
vM3I equ $v3
vM0F equ $v4
vM1F equ $v5
vM2F equ $v6
vM3F equ $v7
vVP0I equ $v8 // vpMatrix rows int/frac
vVP1I equ $v9 // Valid in vertex and lighting only
vVP2I equ $v10
vVP3I equ $v11
vVP0F equ $v12
vVP1F equ $v13
vVP2F equ $v14
vVP3F equ $v15
// Lighting and vertex load:
vPairNrml equ $v16 // Vertex pair normals (model then world space)
vPairLt equ $v17 // Vertex pair total light color/intensity (RGB-RGB-)
vNrmOut equ $v18 // Output of lt_normalize (rarely used, but needed as all temps used)
// $v19 not used during vertex / lighting
vPairPosI equ $v20 // Vertex pair model / world space position int/frac
vPairPosF equ $v21
vPairST equ $v22 // Vertex pair ST texture coordinates
vPairTPosF equ $v23 // Vertex pair transformed (clip / screen) space position frac/int
vPairTPosI equ $v24
.if CFG_LEGACY_VTX_PIPE // One pair is outputs of vert mtx xfrm, other is temps
vAAA equ $v20
vBBB equ $v21
sOUTF equ vPairTPosF
sOUTI equ vPairTPosI
.else
sOUTF equ vPairPosF
sOUTI equ vPairPosI
vAAA equ $v23 // Temps
vBBB equ $v24
.endif
vCCC equ $v25
vDDD equ $v26
vPairRGBA equ $v27 // Vertex pair color
// Vertex write, after lighting:
// Global:
vOne equ $v28 // Global, all elements = 1
// $v29: permanent temp register, also write results here to discard
// $v30: parameters for vertex/lighting; other constants for tri write
// $v31: Only global constant vector register
// For tri write only:
vZero equ $v0 // all elements = 0
// Global and semi-global (i.e. one main function + occasional local) scalar regs:
// $zero // Hardwired zero scalar register
perfCounterD equ $12 // Performance counter D (functions depend on config)
altBaseReg equ $13 // Alternate base address register for vector loads
inputVtxPos equ $14 // Pointer to loaded vertex to transform
outputVtxPos equ $15 // Pointer to vertex buffer to store transformed verts
clipFlags equ $16 // Current clipping flags being checked
clipPolyRead equ $17 // Read pointer within current polygon being clipped
clipPolySelect equ $18 // Clip poly double buffer selection, or < 0 for normal tri write
clipPolyWrite equ $21 // Write pointer within current polygon being clipped
rdpCmdBufEndP1 equ $22 // Pointer to one command word past "end" (middle) of RDP command buf
rdpCmdBufPtr equ $23 // RDP command buffer current DMEM pointer
cmd_w1_dram equ $24 // DL command word 1, which is also DMA DRAM addr
cmd_w0 equ $25 // DL command word 0, also holds next tris info
taskDataPtr equ $26 // Task data (display list) DRAM pointer
inputBufferPos equ $27 // DMEM position within display list input buffer, relative to end
perfCounterA equ $28 // Performance counter A (functions depend on config)
perfCounterB equ $29 // Performance counter B (functions depend on config)
perfCounterC equ $30 // Performance counter C (functions depend on config)
// $ra // Return address
// Misc scalar regs:
clipMaskIdx equ $6
secondVtxPos equ $8
curLight equ $9
// Arguments to dma_read_write
dmaLen equ $19 // also used by itself
dmemAddr equ $20
// cmd_w1_dram // used for all dma_read_write DRAM addresses
// Argument to load_overlay*
postOvlRA equ $10 // Commonly used locally
// ==== Summary of uses of all registers
// $zero: Hardwired zero scalar register
// $1: vertex 1 addr, zero when command handler is called, count of
// remaining vertices * 0x10, pointer to store texture coefficients, local
// $2: vertex 2 addr, vertex at end of edge during clipping, pointer to store
// shade coefficients, local
// $3: vertex 3 addr, vertex at start of edge during clipping, local
// $4: pre-shuffle vertex 1 addr for flat shading during tri write (global)
// $5: geometry mode middle 2 bytes during vertex load / lighting, local
// $6: clipMaskIdx, geometry mode low byte during tri write, local
// $7: command byte when command handler is called, mIT recompute flag in
// Overlay 4, local
// $8: secondVtxPos, local
// $9: curLight, clip mask during clipping, local
// $10: postOvlRA, common local
// $11: very common local
// $12: perfCounterD (global). This must be $12 for S2DEX compat in while_wait_dma_busy.
// $13: altBaseReg (global)
// $14: inputVtxPos, local
// $15: outputVtxPos, local
// $16: clipFlags (global)
// $17: clipPolyRead (global)
// $18: clipPolySelect (global)
// $19: dmaLen, onscreen vertex during clipping, local
// $20: dmemAddr, local
// $21: clipPolyWrite (global)
// $22: rdpCmdBufEndP1 (global)
// $23: rdpCmdBufPtr (global)
// $24: cmd_w1_dram, local
// $25: cmd_w0 (global); holds next tris info during tri write -> clipping ->
// vtx write
// $26: taskDataPtr (global)
// $27: inputBufferPos (global)
// $28: perfCounterA (global)
// $29: perfCounterB (global)
// $30: perfCounterC (global)
// $ra: Return address for jal, b*al
// vtx_store registers. They all start with s for store.
// armips only executes "equ" statements on the codepath where they are defined.
// However, it always parses all assembly instructions, even if the current codepath
// is not active. So, code like "A equ $20; add A, $11, $11" will cause an error
// on a disabled codepath, as the first statement is not executed but the second
// is parsed and A is not defined.
// For CFG_LEGACY_VTX_PIPE, use the registers which would normally be the VP matrix
// to store constants from setup, including through clipping. This does not save
// cycles during vertex processing because the loads are always hidden, but it saves
// two instructions each to save and restore them. (For ST it saves cycles too)
// Common for all
s1WI equ $v16
s1WF equ $v17
sRTF equ $v25
sRTI equ $v26
sSCF equ $v20
sSCI equ $v21
// Viewport scale/offset, ST scale/offset
.if CFG_LEGACY_VTX_PIPE
sVPS equ $v8
sVPO equ $v9
sSTS equ $v10
sSTO equ $v29 // not supported on LVP
.else
.if CFG_NO_OCCLUSION_PLANE
sVPS equ $v26
.else
sVPS equ $v16
.endif
sVPO equ $v17
sSTS equ $v25
sSTO equ $v26
.endif
// Misc
.if CFG_NO_OCCLUSION_PLANE
sFOG equ $v25
.if CFG_LEGACY_VTX_PIPE
sCLZ equ $v19
sTCL equ $v19
.else
sCLZ equ $v21
sTCL equ $v21
.endif
sTPN equ $v16
.else
sFOG equ $v16
sCLZ equ $v25
sTCL equ $v29 // does not exist on this codepath
sTPN equ $v18
.endif
// New LVP_NOC only
.if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE
sKPI equ $v11
sKPF equ $v18
sKPG equ vBBB
sST2 equ $v11
sFGM equ $v12
.else
sKPI equ $v29 // do not exist
sKPF equ $v29
sKPG equ $v29
sST2 equ $v29
sFGM equ $v29
.endif
// Occlusion plane
.if CFG_NO_OCCLUSION_PLANE
sO03 equ $v29 // none of these exist
sO47 equ $v29
sOCM equ $v29
sOC1 equ $v29
sOC2 equ $v29
sOC3 equ $v29
sOPM equ $v29
sOPMs equ $v29
sOSC equ $v29
.else
sO03 equ $v26
sO47 equ $v23
sOCM equ $v22
sOC1 equ $v21
sOC2 equ $v27
sOC3 equ $v21
.if CFG_LEGACY_VTX_PIPE
sOPM equ $v12 // Kept here through whole processing
sOPMs equ $v12 // so these are the same
.else
sOPM equ $v17 // When used
sOPMs equ $v24 // Just another temp register
.endif
sOSC equ $v21
.endif
.if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE
ltLookAt equ vCCC
.elseif CFG_LEGACY_VTX_PIPE
ltLookAt equ $v18
.else
ltLookAt equ $v29
.endif
vLookat0 equ vPairLt
vLookat1 equ vAAA
// Temp storage after rdpCmdBufEndP1. There is 0xA8 of space here which will
// always be free during vtx load or clipping.
// The values below are byte offsets into that space, spaced 0x10 apart.
// tempXfrmSingle and tempPrevVtxGarbage are both 0x50, i.e. they name the same
// location.
tempViewportScale equ 0x00
tempViewportOffset equ 0x10
tempOccPlusMinus equ 0x20
tempVpRGBA equ 0x30
tempVpPkNorm equ 0x40
tempXfrmSingle equ 0x50
tempPrevVtxGarbage equ 0x50 // Up to 2 * 0x26 = 0x4C used -> to 0x9C
////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////// IMEM //////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Macros for placing code in different places based on the microcode version
// Emits the setup for a matrix multiply: loads the output matrix address from
// movememTable into $6, points $3 at tempMatrix, calls while_wait_dma_busy,
// and copies $6 into $2 so that input 0 and the output are the same matrix.
// Expects $1 to already select the target matrix (see the lhu comment).
.macro instantiate_mtx_end_begin
// Multiplies the temp loaded matrix into the M or VP matrix
lhu $6, (movememTable + G_MV_MMTX)($1) // Output; $1 holds 0 for M or 4 for VP.
li $3, tempMatrix // Input 1 = temp mem (loaded mtx)
jal while_wait_dma_busy
// The move below is in the delay slot of the jal above.
move $2, $6 // Input 0 = output
// Followed immediately by instantiate_mtx_multiply. These need to be broken
// up so we can insert the global mtx_multiply label between them.
.endmacro
.macro instantiate_mtx_multiply
// $3, $2 are input matrices; $6 is output matrix; $7 is 0 for return to vtx
addi $10, $3, 0x0018
@@loop:
vmadn $v7, $v31, $v31[2] // 0
addi $11, $3, 0x0008
vmadh $v6, $v31, $v31[2] // 0
addi $2, $2, -0x0020
vmudh $v29, $v31, $v31[2] // 0
@@innerloop:
ldv $v3[0], 0x0040($2)
ldv $v3[8], 0x0040($2)
lqv $v1[0], 0x0020($3) // Input 1
ldv $v2[0], 0x0020($2)
ldv $v2[8], 0x0020($2)
lqv $v0[0], 0x0000($3) // Input 1
vmadl $v29, $v3, $v1[0h]
addi $3, $3, 0x0002
vmadm $v29, $v2, $v1[0h]
addi $2, $2, 0x0008 // Increment input 0 pointer
vmadn $v5, $v3, $v0[0h]
bne $3, $11, @@innerloop
vmadh $v4, $v2, $v0[0h]