-
Notifications
You must be signed in to change notification settings - Fork 6
/
mult87.a
162 lines (150 loc) · 4.6 KB
/
mult87.a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
; mult87.a
; from https://github.com/MVittiS/Fast6502Mult/blob/master/mult4packed.s
;
; 8 bit x 8 bit unsigned multiply, 16 bit result
; Average cycles: 174.0
; 630 bytes
multA = $0
multArot = $1
multB = $2
multBrot = $3
multAhigh = $4
multArothigh = $5
multBlow = $6
multBrotlow = $7
p0 = $0
p1 = $2
p2 = $3
p3 = $1
; The idea behind the algorithm is to perform the multiplication in 4x4 bit
; chunks, using two tables - one for multiplication, and another for rotation.
; We start by taking our two arguments and pre-fetching their rotated
; equivalents, to compose four partial products P0-P3:
;
; P0 = (RotA & 0xF0) | (B & 0xF)
; P1 = (A & 0xF0) | (B & 0xF)
; P2 = (RotA & 0xF0) | (RotB & 0xF)
; P3 = (A & 0xF0) | (RotB & 0xF)
;
; From which we can assemble the final sum:
;
; Clow = P0
; Clow += (RotP1 & 0xF0)
; Clow += (RotP2 & 0xF0)
; Chigh += (RotP1 & 0xF)
; Chigh += (RotP2 & 0xF)
; Chigh += P3
;
; And we're good to go.
* = $0200
mult44table:
!byte $0,$0,$0,$0,$0,$0,$0,$0,$0,$0,$0,$0,$0,$0,$0,$0
!byte $0,$1,$2,$3,$4,$5,$6,$7,$8,$9,$a,$b,$c,$d,$e,$f
!byte $0,$2,$4,$6,$8,$a,$c,$e,$10,$12,$14,$16,$18,$1a,$1c,$1e
!byte $0,$3,$6,$9,$c,$f,$12,$15,$18,$1b,$1e,$21,$24,$27,$2a,$2d
!byte $0,$4,$8,$c,$10,$14,$18,$1c,$20,$24,$28,$2c,$30,$34,$38,$3c
!byte $0,$5,$a,$f,$14,$19,$1e,$23,$28,$2d,$32,$37,$3c,$41,$46,$4b
!byte $0,$6,$c,$12,$18,$1e,$24,$2a,$30,$36,$3c,$42,$48,$4e,$54,$5a
!byte $0,$7,$e,$15,$1c,$23,$2a,$31,$38,$3f,$46,$4d,$54,$5b,$62,$69
!byte $0,$8,$10,$18,$20,$28,$30,$38,$40,$48,$50,$58,$60,$68,$70,$78
!byte $0,$9,$12,$1b,$24,$2d,$36,$3f,$48,$51,$5a,$63,$6c,$75,$7e,$87
!byte $0,$a,$14,$1e,$28,$32,$3c,$46,$50,$5a,$64,$6e,$78,$82,$8c,$96
!byte $0,$b,$16,$21,$2c,$37,$42,$4d,$58,$63,$6e,$79,$84,$8f,$9a,$a5
!byte $0,$c,$18,$24,$30,$3c,$48,$54,$60,$6c,$78,$84,$90,$9c,$a8,$b4
!byte $0,$d,$1a,$27,$34,$41,$4e,$5b,$68,$75,$82,$8f,$9c,$a9,$b6,$c3
!byte $0,$e,$1c,$2a,$38,$46,$54,$62,$70,$7e,$8c,$9a,$a8,$b6,$c4,$d2
!byte $0,$f,$1e,$2d,$3c,$4b,$5a,$69,$78,$87,$96,$a5,$b4,$c3,$d2,$e1
rot4table:
!byte $0,$10,$20,$30,$40,$50,$60,$70,$80,$90,$a0,$b0,$c0,$d0,$e0,$f0
!byte $1,$11,$21,$31,$41,$51,$61,$71,$81,$91,$a1,$b1,$c1,$d1,$e1,$f1
!byte $2,$12,$22,$32,$42,$52,$62,$72,$82,$92,$a2,$b2,$c2,$d2,$e2,$f2
!byte $3,$13,$23,$33,$43,$53,$63,$73,$83,$93,$a3,$b3,$c3,$d3,$e3,$f3
!byte $4,$14,$24,$34,$44,$54,$64,$74,$84,$94,$a4,$b4,$c4,$d4,$e4,$f4
!byte $5,$15,$25,$35,$45,$55,$65,$75,$85,$95,$a5,$b5,$c5,$d5,$e5,$f5
!byte $6,$16,$26,$36,$46,$56,$66,$76,$86,$96,$a6,$b6,$c6,$d6,$e6,$f6
!byte $7,$17,$27,$37,$47,$57,$67,$77,$87,$97,$a7,$b7,$c7,$d7,$e7,$f7
!byte $8,$18,$28,$38,$48,$58,$68,$78,$88,$98,$a8,$b8,$c8,$d8,$e8,$f8
!byte $9,$19,$29,$39,$49,$59,$69,$79,$89,$99,$a9,$b9,$c9,$d9,$e9,$f9
!byte $a,$1a,$2a,$3a,$4a,$5a,$6a,$7a,$8a,$9a,$aa,$ba,$ca,$da,$ea,$fa
!byte $b,$1b,$2b,$3b,$4b,$5b,$6b,$7b,$8b,$9b,$ab,$bb,$cb,$db,$eb,$fb
!byte $c,$1c,$2c,$3c,$4c,$5c,$6c,$7c,$8c,$9c,$ac,$bc,$cc,$dc,$ec,$fc
!byte $d,$1d,$2d,$3d,$4d,$5d,$6d,$7d,$8d,$9d,$ad,$bd,$cd,$dd,$ed,$fd
!byte $e,$1e,$2e,$3e,$4e,$5e,$6e,$7e,$8e,$9e,$ae,$be,$ce,$de,$ee,$fe
!byte $f,$1f,$2f,$3f,$4f,$5f,$6f,$7f,$8f,$9f,$af,$bf,$cf,$df,$ef,$ff
mult4packed:
; We first start by getting the arguments from the registers, and getting
; their rotated equivalents.
stx multA ; First multiplicand
lda rot4table, x
sta multArot ; First multiplicand, rotated
sty multB ; Second multiplicand
tya
tax
lda rot4table, x
sta multBrot ; Second multiplicand, rotated
; Then, we extract the four ingredients we'll need:
; A high, RotA high, B low, and RotB low.
and #$F
sta multBrotlow
lda multB
and #$F
sta multBlow
lda multA
and #$F0
sta multAhigh
lda multArot
and #$F0
sta multArothigh
; Now, we assemble our partial products, starting with an
; element already in the accumulator - P0. The rest follows.
; Finally, we use the partial products as arguments in a look-up
; multiplication table, and start building our products.
; P0
ora multBlow
tax
lda mult44table, x
sta p0
; P1
lda multAhigh
ora multBlow
tax
lda mult44table, x
tax
lda rot4table, x
sta p1
; P2
lda multArothigh
ora multBrotlow
tax
lda mult44table, x
tax
lda rot4table, x
sta p2
; P3
lda multAhigh
ora multBrotlow
tax
lda mult44table, x
sta p3
; The products P0 and P3 are ready, and we just have to decompose
; and add P1 and P2 now.
clc
; First we sum P1...
lda p1
and #$F0
adc p0
sta p0
lda p1
and #$F
adc p3
sta p3
; ...and finally we sum P2. Done!
lda p2
and #$F0
adc p0
sta p0
lda p2
and #$F
adc p3
sta p3
rts