-
Notifications
You must be signed in to change notification settings - Fork 43
/
mufplib.S
1311 lines (1208 loc) · 31.8 KB
/
mufplib.S
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/**
* Copyright (c) 2020 Mark Owen https://www.quinapalus.com .
*
* Raspberry Pi (Trading) Ltd (Licensor) hereby grants to you a non-exclusive license to use the software solely on a
* Raspberry Pi RP2040 device. No other use is permitted under the terms of this license.
*
* This software is also available from the copyright owner under GPLv2 licence.
*
* THIS SOFTWARE IS PROVIDED BY THE LICENSOR AND COPYRIGHT OWNER "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE LICENSOR OR COPYRIGHT OWNER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
.syntax unified
.cpu cortex-m0plus
.thumb
@ exported symbols
.global mufp_lib_start
.global mufp_fadd
.global mufp_fsub
.global mufp_fmul
.global mufp_fdiv
.global mufp_fcmp_combined
.global mufp_fcmp_fast
.global mufp_fcmp_fast_flags
.global mufp_fsqrt
.global mufp_float2int
.global mufp_float2fix
.global mufp_float2uint
.global mufp_float2ufix
.global mufp_int2float
.global mufp_fix2float
.global mufp_uint2float
.global mufp_ufix2float
.global mufp_int642float
.global mufp_fix642float
.global mufp_uint642float
.global mufp_ufix642float
.global mufp_fcos
.global mufp_fsin
.global mufp_fsincos
.global mufp_ftan
.global mufp_fatan2
.global mufp_fexp
.global mufp_fln
.global mufp_lib_end
#ifndef USE_HW_DIV
.equ use_hw_div,0
#else
.equ use_hw_div,1
#endif
.equ IOPORT ,0xd0000000
.equ DIV_UDIVIDEND,0x00000060
.equ DIV_UDIVISOR ,0x00000064
.equ DIV_QUOTIENT ,0x00000070
.equ DIV_CSR ,0x00000078
mufp_lib_start:
@ exchange r0<->r1, r2<->r3
xchxy:
push {r0,r2,r14}
mov r0,r1
mov r2,r3
pop {r1,r3,r15}
@ IEEE single in r0-> signed (two's complemennt) mantissa in r0 9Q23 (24 significant bits), signed exponent (bias removed) in r2
@ trashes r4; zero, denormal -> mantissa=+/-1, exponent=-380; Inf, NaN -> mantissa=+/-1, exponent=+640
unpackx:
lsrs r2,r0,#23 @ save exponent and sign
lsls r0,#9 @ extract mantissa
lsrs r0,#9
movs r4,#1
lsls r4,#23
orrs r0,r4 @ reinstate implied leading 1
cmp r2,#255 @ test sign bit
uxtb r2,r2 @ clear it
bls 1f @ branch on positive
rsbs r0,#0 @ negate mantissa
1:
subs r2,#1
cmp r2,#254 @ zero/denormal/Inf/NaN?
bhs 2f
subs r2,#126 @ remove exponent bias: can now be -126..+127
bx r14
2: @ here with special-case values
cmp r0,#0
mov r0,r4 @ set mantissa to +1
bpl 3f
rsbs r0,#0 @ zero/denormal/Inf/NaN: mantissa=+/-1
3:
subs r2,#126 @ zero/denormal: exponent -> -127; Inf, NaN: exponent -> 128
lsls r2,#2 @ zero/denormal: exponent -> -508; Inf, NaN: exponent -> 512
adds r2,#128 @ zero/denormal: exponent -> -380; Inf, NaN: exponent -> 640
bx r14
@ normalise and pack signed mantissa in r0 nominally 3Q29, signed exponent in r2-> IEEE single in r0
@ trashes r4, preserves r1,r3
@ r5: "sticky bits", must be zero iff all result bits below r0 are zero for correct rounding
packx:
lsrs r4,r0,#31 @ save sign bit
lsls r4,r4,#31 @ sign now in b31
bpl 2f @ skip if positive
cmp r5,#0
beq 11f
adds r0,#1 @ fiddle carry in to following rsb if sticky bits are non-zero
11:
rsbs r0,#0 @ can now treat r0 as unsigned
packx0:
bmi 3f @ catch r0=0x80000000 case
2:
subs r2,#1 @ normalisation loop
adds r0,r0
beq 1f @ zero? special case
bpl 2b @ normalise so leading "1" in bit 31
3:
adds r2,#129 @ (mis-)offset exponent
bne 12f @ special case: highest denormal can round to lowest normal
adds r0,#0x80 @ in special case, need to add 256 to r0 for rounding
bcs 4f @ tripped carry? then have leading 1 in C as required
12:
adds r0,#0x80 @ rounding
bcs 4f @ tripped carry? then have leading 1 in C as required (and result is even so can ignore sticky bits)
cmp r5,#0
beq 7f @ sticky bits zero?
8:
lsls r0,#1 @ remove leading 1
9:
subs r2,#1 @ compensate exponent on this path
4:
cmp r2,#254
bge 5f @ overflow?
adds r2,#1 @ correct exponent offset
ble 10f @ denormal/underflow?
lsrs r0,#9 @ align mantissa
lsls r2,#23 @ align exponent
orrs r0,r2 @ assemble exponent and mantissa
6:
orrs r0,r4 @ apply sign
1:
bx r14
5:
movs r0,#0xff @ create infinity
lsls r0,#23
b 6b
10:
movs r0,#0 @ create zero
bx r14
7: @ sticky bit rounding case
lsls r5,r0,#24 @ check bottom 8 bits of r0
bne 8b @ in rounding-tie case?
lsrs r0,#9 @ ensure even result
lsls r0,#10
b 9b
.align 2
.ltorg
@ signed multiply r0 1Q23 by r1 4Q23, result in r0 7Q25, sticky bits in r5
@ trashes r3,r4
mul0:
uxth r3,r0 @ Q23
asrs r4,r1,#16 @ Q7
muls r3,r4 @ L*H, Q30 signed
asrs r4,r0,#16 @ Q7
uxth r5,r1 @ Q23
muls r4,r5 @ H*L, Q30 signed
adds r3,r4 @ sum of middle partial products
uxth r4,r0
muls r4,r5 @ L*L, Q46 unsigned
lsls r5,r4,#16 @ initialise sticky bits from low half of low partial product
lsrs r4,#16 @ Q25
adds r3,r4 @ add high half of low partial product to sum of middle partial products
@ (cannot generate carry by limits on input arguments)
asrs r0,#16 @ Q7
asrs r1,#16 @ Q7
muls r0,r1 @ H*H, Q14 signed
lsls r0,#11 @ high partial product Q25
lsls r1,r3,#27 @ sticky
orrs r5,r1 @ collect further sticky bits
asrs r1,r3,#5 @ middle partial products Q25
adds r0,r1 @ final result
bx r14
.thumb_func
mufp_fcmp_combined:
@ *****
@ WARNING: this code is required by the wrapper functions to preserve R3
@ *****
lsls r2,r0,#1
lsrs r2,#24
beq 1f
cmp r2,#0xff
bne 2f
1:
lsrs r0,#23 @ clear mantissa if NaN or denormal
lsls r0,#23
2:
lsls r2,r1,#1
lsrs r2,#24
beq 1f
cmp r2,#0xff
bne 2f
1:
lsrs r1,#23 @ clear mantissa if NaN or denormal
lsls r1,#23
2:
.thumb_func
mufp_fcmp_fast_flags:
.thumb_func
mufp_fcmp_fast:
movs r2,#1 @ initialise result
eors r1,r0
bmi 4f @ opposite signs? then can proceed on basis of sign of x
eors r1,r0 @ restore y
bpl 1f
rsbs r2,#0 @ both negative? flip comparison
1:
cmp r0,r1
bgt 2f
blt 3f
5:
movs r2,#0
3:
rsbs r2,#0
2:
subs r0,r2,#0
bx r14
4:
orrs r1,r0
adds r1,r1
beq 5b
cmp r0,#0
bge 2b
b 3b
@ convert float to signed int, rounding towards -Inf, clamping
.thumb_func
mufp_float2int:
movs r1,#0 @ fall through
@ convert float in r0 to signed fixed point in r0, clamping
.thumb_func
mufp_float2fix:
push {r4,r14}
bl unpackx
movs r3,r2
adds r3,#130
bmi 6f @ -0?
add r2,r1 @ incorporate binary point position into exponent
subs r2,#23 @ r2 is now amount of left shift required
blt 1f @ requires right shift?
cmp r2,#7 @ overflow?
ble 4f
3: @ overflow
asrs r1,r0,#31 @ +ve:0 -ve:0xffffffff
mvns r1,r1 @ +ve:0xffffffff -ve:0
movs r0,#1
lsls r0,#31
5:
eors r0,r1 @ +ve:0x7fffffff -ve:0x80000000 (unsigned path: 0xffffffff)
pop {r4,r15}
1:
rsbs r2,#0 @ right shift for r0, >0
cmp r2,#32
blt 2f @ more than 32 bits of right shift?
movs r2,#32
2:
asrs r0,r0,r2
pop {r4,r15}
6:
movs r0,#0
pop {r4,r15}
@ unsigned version
.thumb_func
mufp_float2uint:
movs r1,#0 @ fall through
.thumb_func
mufp_float2ufix:
push {r4,r14}
bl unpackx
add r2,r1 @ incorporate binary point position into exponent
movs r1,r0
bmi 5b @ negative? return zero
subs r2,#23 @ r2 is now amount of left shift required
blt 1b @ requires right shift?
mvns r1,r0 @ ready to return 0xffffffff
cmp r2,#8 @ overflow?
bgt 5b
4:
lsls r0,r0,r2 @ result fits, left shifted
pop {r4,r15}
@ convert uint64 to float, rounding
.thumb_func
mufp_uint642float:
movs r2,#0 @ fall through
@ convert unsigned 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
.thumb_func
mufp_ufix642float:
push {r4,r5,r14}
cmp r1,#0
bpl 3f @ positive? we can use signed code
lsls r5,r1,#31 @ contribution to sticky bits
orrs r5,r0
lsrs r0,r1,#1
subs r2,#1
b 4f
@ convert int64 to float, rounding
.thumb_func
mufp_int642float:
movs r2,#0 @ fall through
@ convert signed 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
.thumb_func
mufp_fix642float:
push {r4,r5,r14}
3:
movs r5,r0
orrs r5,r1
beq ret_pop45 @ zero? return +0
asrs r5,r1,#31 @ sign bits
2:
asrs r4,r1,#24 @ try shifting 7 bits at a time
cmp r4,r5
bne 1f @ next shift will overflow?
lsls r1,#7
lsrs r4,r0,#25
orrs r1,r4
lsls r0,#7
adds r2,#7
b 2b
1:
movs r5,r0
movs r0,r1
4:
rsbs r2,#0
adds r2,#32+29
b packret
@ convert signed int to float, rounding
.thumb_func
mufp_int2float:
movs r1,#0 @ fall through
@ convert signed fix to float, rounding; number of r0 bits after point in r1
.thumb_func
mufp_fix2float:
push {r4,r5,r14}
1:
movs r2,#29
subs r2,r1 @ fix exponent
packretns: @ pack and return, sticky bits=0
movs r5,#0
packret: @ common return point: "pack and return"
bl packx
ret_pop45:
pop {r4,r5,r15}
@ unsigned version
.thumb_func
mufp_uint2float:
movs r1,#0 @ fall through
.thumb_func
mufp_ufix2float:
push {r4,r5,r14}
cmp r0,#0
bge 1b @ treat <2^31 as signed
movs r2,#30
subs r2,r1 @ fix exponent
lsls r5,r0,#31 @ one sticky bit
lsrs r0,#1
b packret
@ All the scientific functions are implemented using the CORDIC algorithm. For notation,
@ details not explained in the comments below, and a good overall survey see
@ "50 Years of CORDIC: Algorithms, Architectures, and Applications" by Meher et al.,
@ IEEE Transactions on Circuits and Systems Part I, Volume 56 Issue 9.
@ Register use:
@ r0: x
@ r1: y
@ r2: z/omega
@ r3: coefficient pointer
@ r4,r12: m
@ r5: i (shift)
cordic_start: @ initialisation
movs r5,#0 @ initial shift=0
mov r12,r4
b 5f
cordic_vstep: @ one step of algorithm in vector mode
cmp r1,#0 @ check sign of y
bgt 4f
b 1f
cordic_rstep: @ one step of algorithm in rotation mode
cmp r2,#0 @ check sign of angle
bge 1f
4:
subs r1,r6 @ negative rotation: y=y-(x>>i)
rsbs r7,#0
adds r2,r4 @ accumulate angle
b 2f
1:
adds r1,r6 @ positive rotation: y=y+(x>>i)
subs r2,r4 @ accumulate angle
2:
mov r4,r12
muls r7,r4 @ apply sign from m
subs r0,r7 @ finish rotation: x=x{+/-}(y>>i)
5:
ldmia r3!,{r4} @ fetch next angle from table and bump pointer
lsrs r4,#1 @ repeated angle?
bcs 3f
adds r5,#1 @ adjust shift if not
3:
mov r6,r0
asrs r6,r5 @ x>>i
mov r7,r1
asrs r7,r5 @ y>>i
lsrs r4,#1 @ shift end flag into carry
bx r14
@ CORDIC rotation mode
cordic_rot:
push {r6,r7,r14}
bl cordic_start @ initialise
1:
bl cordic_rstep
bcc 1b @ step until table finished
asrs r6,r0,#14 @ remaining small rotations can be linearised: see IV.B of paper referenced above
asrs r7,r1,#14
asrs r2,#3
muls r6,r2 @ all remaining CORDIC steps in a multiplication
muls r7,r2
mov r4,r12
muls r7,r4
asrs r6,#12
asrs r7,#12
subs r0,r7 @ x=x{+/-}(yz>>k)
adds r1,r6 @ y=y+(xz>>k)
cordic_exit:
pop {r6,r7,r15}
@ CORDIC vector mode
cordic_vec:
push {r6,r7,r14}
bl cordic_start @ initialise
1:
bl cordic_vstep
bcc 1b @ step until table finished
4:
cmp r1,#0 @ continue as in cordic_vstep but without using table; x is not affected as y is small
bgt 2f @ check sign of y
adds r1,r6 @ positive rotation: y=y+(x>>i)
subs r2,r4 @ accumulate angle
b 3f
2:
subs r1,r6 @ negative rotation: y=y-(x>>i)
adds r2,r4 @ accumulate angle
3:
asrs r6,#1
asrs r4,#1 @ next "table entry"
bne 4b
b cordic_exit
.thumb_func
mufp_fsin:
.thumb_func
mufp_fsincos: @ calculate sin and cos using CORDIC rotation method
push {r4,r5,r14}
movs r1,#24
bl mufp_float2fix @ range reduction by repeated subtraction/addition in fixed point
ldr r4,pi_q29
lsrs r4,#4 @ 2pi Q24
1:
subs r0,r4
bge 1b
1:
adds r0,r4
bmi 1b @ now in range 0..2pi
lsls r2,r0,#2 @ z Q26
lsls r5,r4,#1 @ pi Q26 (r4=pi/2 Q26)
ldr r0,=#0x136e9db4 @ initialise CORDIC x,y with scaling
movs r1,#0
1:
cmp r2,r4 @ >pi/2?
blt 2f
subs r2,r5 @ reduce range to -pi/2..pi/2
rsbs r0,#0 @ rotate vector by pi
b 1b
2:
lsls r2,#3 @ Q29
adr r3,tab_cc @ circular coefficients
movs r4,#1 @ m=1
bl cordic_rot
adds r1,#9 @ fiddle factor to make sin(0)==0
movs r2,#0 @ exponents to zero
movs r3,#0
movs r5,#0 @ no sticky bits
bl clampx
bl packx @ pack cosine
bl xchxy
bl clampx
b packretns @ pack sine
.thumb_func
mufp_fcos:
push {r14}
bl mufp_fsin
mov r0,r1 @ extract cosine result
pop {r15}
@ force r0 to lie in range [-1,1] Q29
clampx:
movs r4,#1
lsls r4,#29
cmp r0,r4
bgt 1f
rsbs r4,#0
cmp r0,r4
ble 1f
bx r14
1:
movs r0,r4
bx r14
.thumb_func
mufp_ftan:
push {r4,r5,r6,r14}
bl mufp_fsin @ sine in r0/r2, cosine in r1/r3
b fdiv_n @ sin/cos
.thumb_func
mufp_fexp: @ calculate cosh and sinh using rotation method; add to obtain exp
push {r4,r5,r14}
movs r1,#24
bl mufp_float2fix @ Q24: covers entire valid input range
asrs r1,r0,#16 @ Q8
ldr r2,=#5909 @ log_2(e) Q12
muls r1,r2 @ estimate exponent of result Q20
asrs r1,#19 @ Q1
adds r1,#1 @ rounding
asrs r1,#1 @ rounded estimate of exponent of result
push {r1} @ save for later
lsls r2,r0,#5 @ Q29
ldr r0,=#0x162e42ff @ ln(2) Q29
muls r1,r0 @ accurate contribution of estimated exponent
subs r2,r1 @ residual to be exponentiated, approximately -.5..+.5 Q29
ldr r0,=#0x2c9e15ca @ initialise CORDIC x,y with scaling
movs r1,#0
adr r3,tab_ch @ hyperbolic coefficients
mvns r4,r1 @ m=-1
bl cordic_rot @ calculate cosh and sinh
add r0,r1 @ exp=cosh+sinh
pop {r2} @ recover exponent
b packretns @ pack result
@ keep the old square root function in case we need to save space in future
@ and because it is used by fln
.thumb_func
mufp_fsqrt_ln: @ calculate sqrt and ln using vector method
push {r4,r5,r14}
bl unpackx
movs r1,r0 @ -ve argument?
bmi 3f @ return -Inf, -Inf
ldr r1,=#0x0593C2B9 @ scale factor for CORDIC
bl mul0 @ Q29
asrs r1,r2,#1 @ halve exponent
bcc 1f
adds r1,#1 @ was odd: add 1 and shift mantissa
asrs r0,#1
1:
push {r1} @ save exponent/2 for later
mov r1,r0
ldr r3,=#0x0593C2B9 @ re-use constant
lsls r3,#2
adds r0,r3 @ "a+1"
subs r1,r3 @ "a-1"
movs r2,#0
adr r3,tab_ch @ hyperbolic coefficients
mvns r4,r2 @ m=-1
bl cordic_vec
mov r1,r2 @ keep ln result
pop {r2} @ retrieve exponent/2
2:
movs r3,r2
b packretns @ pack sqrt result
3:
movs r2,#255
b 2b
.thumb_func
mufp_fln:
push {r4,r5,r14}
bl mufp_fsqrt_ln @ get unpacked ln in r1/r3; exponent has been halved
cmp r3,#70 @ ln(Inf)?
bgt 2f @ return Inf
rsbs r3,#0
cmp r3,#70
bgt 1f @ ln(0)? return -Inf
3:
ldr r0,=#0x0162e430 @ ln(4) Q24
muls r0,r3 @ contribution from negated, halved exponent
adds r1,#8 @ round result of ln
asrs r1,#4 @ Q24
subs r0,r1,r0 @ add in contribution from (negated) exponent
movs r2,#5 @ pack expects Q29
b packretns
1:
mvns r0,r0 @ make result -Inf
2:
movs r2,#255
b packretns
.thumb_func
mufp_fatan2:
push {r4,r5,r14}
@ unpack arguments and shift one down to have common exponent
bl unpackx
bl xchxy
bl unpackx
lsls r0,r0,#5 @ Q28
lsls r1,r1,#5 @ Q28
adds r4,r2,r3 @ this is -760 if both arguments are 0 and at least -380-126=-506 otherwise
asrs r4,#9
adds r4,#1
bmi 2f @ force y to 0 proper, so result will be zero
subs r4,r2,r3 @ calculate shift
bge 1f @ ex>=ey?
rsbs r4,#0 @ make shift positive
asrs r0,r4
cmp r4,#28
blo 3f
asrs r0,#31
b 3f
1:
asrs r1,r4
cmp r4,#28
blo 3f
2:
@ here |x|>>|y| or both x and y are ±0
cmp r0,#0
bge 4f @ x positive, return signed 0
ldr r0,pi_q29 @ x negative, return +/- pi
asrs r1,#31
eors r0,r1
b 7f
4:
asrs r0,r1,#31
b 7f
3:
movs r2,#0 @ initial angle
cmp r0,#0 @ x negative
bge 5f
rsbs r0,#0 @ rotate to 1st/4th quadrants
rsbs r1,#0
ldr r2,pi_q29 @ pi Q29
5:
adr r3,tab_cc @ circular coefficients
movs r4,#1 @ m=1
bl cordic_vec @ also produces magnitude (with scaling factor 1.646760119), which is discarded
mov r0,r2 @ result here is -pi/2..3pi/2 Q29
@ asrs r2,#29
@ subs r0,r2
ldr r2,pi_q29 @ pi Q29
adds r4,r0,r2 @ attempt to fix -3pi/2..-pi case
bcs 6f @ -pi/2..0? leave result as is
subs r4,r0,r2 @ <pi? leave as is
bmi 6f
subs r0,r4,r2 @ >pi: take off 2pi
6:
subs r0,#1 @ fiddle factor so atan2(0,1)==0
7:
movs r2,#0 @ exponent for pack
b packretns
.align 2
.ltorg
@ first entry in following table is pi Q29
pi_q29:
@ circular CORDIC coefficients: atan(2^-i), b0=flag for preventing shift, b1=flag for end of table
tab_cc:
.word 0x1921fb54*4+1 @ no shift before first iteration
.word 0x0ed63383*4+0
.word 0x07d6dd7e*4+0
.word 0x03fab753*4+0
.word 0x01ff55bb*4+0
.word 0x00ffeaae*4+0
.word 0x007ffd55*4+0
.word 0x003fffab*4+0
.word 0x001ffff5*4+0
.word 0x000fffff*4+0
.word 0x0007ffff*4+0
.word 0x00040000*4+0
.word 0x00020000*4+0+2 @ +2 marks end
@ hyperbolic CORDIC coefficients: atanh(2^-i), flags as above
tab_ch:
.word 0x1193ea7b*4+0
.word 0x1193ea7b*4+1 @ repeat i=1
.word 0x082c577d*4+0
.word 0x04056247*4+0
.word 0x0200ab11*4+0
.word 0x0200ab11*4+1 @ repeat i=4
.word 0x01001559*4+0
.word 0x008002ab*4+0
.word 0x00400055*4+0
.word 0x0020000b*4+0
.word 0x00100001*4+0
.word 0x00080001*4+0
.word 0x00040000*4+0
.word 0x00020000*4+0
.word 0x00020000*4+1+2 @ repeat i=12
.align 2
.thumb_func
mufp_fsub:
ldr r2,=#0x80000000
eors r1,r2 @ flip sign on second argument
@ drop into fadd, on .align2:ed boundary
.thumb_func
mufp_fadd:
push {r4,r5,r6,r14}
asrs r4,r0,#31
lsls r2,r0,#1
lsrs r2,#24 @ x exponent
beq fa_xe0
cmp r2,#255
beq fa_xe255
fa_xe:
asrs r5,r1,#31
lsls r3,r1,#1
lsrs r3,#24 @ y exponent
beq fa_ye0
cmp r3,#255
beq fa_ye255
fa_ye:
ldr r6,=#0x007fffff
ands r0,r0,r6 @ extract mantissa bits
ands r1,r1,r6
adds r6,#1 @ r6=0x00800000
orrs r0,r0,r6 @ set implied 1
orrs r1,r1,r6
eors r0,r0,r4 @ complement...
eors r1,r1,r5
subs r0,r0,r4 @ ... and add 1 if sign bit is set: 2's complement
subs r1,r1,r5
subs r5,r3,r2 @ ye-xe
subs r4,r2,r3 @ xe-ye
bmi fa_ygtx
@ here xe>=ye
cmp r4,#30
bge fa_xmgty @ xe much greater than ye?
adds r5,#32
movs r3,r2 @ save exponent
@ here y in r1 must be shifted down r4 places to align with x in r0
movs r2,r1
lsls r2,r2,r5 @ keep the bits we will shift off the bottom of r1
asrs r1,r1,r4
b fa_0
fa_ymgtx:
movs r2,#0 @ result is just y
movs r0,r1
b fa_1
fa_xmgty:
movs r3,r2 @ result is just x
movs r2,#0
b fa_1
fa_ygtx:
@ here ye>xe
cmp r5,#30
bge fa_ymgtx @ ye much greater than xe?
adds r4,#32
@ here x in r0 must be shifted down r5 places to align with y in r1
movs r2,r0
lsls r2,r2,r4 @ keep the bits we will shift off the bottom of r0
asrs r0,r0,r5
fa_0:
adds r0,r1 @ result is now in r0:r2, possibly highly denormalised or zero; exponent in r3
beq fa_9 @ if zero, inputs must have been of identical magnitude and opposite sign, so return +0
fa_1:
lsrs r1,r0,#31 @ sign bit
beq fa_8
mvns r0,r0
rsbs r2,r2,#0
bne fa_8
adds r0,#1
fa_8:
adds r6,r6
@ r6=0x01000000
cmp r0,r6
bhs fa_2
fa_3:
adds r2,r2 @ normalisation loop
adcs r0,r0
subs r3,#1 @ adjust exponent
cmp r0,r6
blo fa_3
fa_2:
@ here r0:r2 is the result mantissa 0x01000000<=r0<0x02000000, r3 the exponent, and r1 the sign bit
lsrs r0,#1
bcc fa_4
@ rounding bits here are 1:r2
adds r0,#1 @ round up
cmp r2,#0
beq fa_5 @ sticky bits all zero?
fa_4:
cmp r3,#254
bhs fa_6 @ exponent too large or negative?
lsls r1,#31 @ pack everything
add r0,r1
lsls r3,#23
add r0,r3
fa_end:
pop {r4,r5,r6,r15}
fa_9:
cmp r2,#0 @ result zero?
beq fa_end @ return +0
b fa_1
fa_5:
lsrs r0,#1
lsls r0,#1 @ round to even
b fa_4
fa_6:
bge fa_7
@ underflow
@ can handle denormals here
lsls r0,r1,#31 @ result is signed zero
pop {r4,r5,r6,r15}
fa_7:
@ overflow
lsls r0,r1,#8
adds r0,#255
lsls r0,#23 @ result is signed infinity
pop {r4,r5,r6,r15}
fa_xe0:
@ can handle denormals here
subs r2,#32
adds r2,r4 @ exponent -32 for +Inf, -33 for -Inf
b fa_xe
fa_xe255:
@ can handle NaNs here
lsls r2,#8
add r2,r2,r4 @ exponent ~64k for +Inf, ~64k-1 for -Inf
b fa_xe
fa_ye0:
@ can handle denormals here
subs r3,#32
adds r3,r5 @ exponent -32 for +Inf, -33 for -Inf
b fa_ye
fa_ye255:
@ can handle NaNs here
lsls r3,#8
add r3,r3,r5 @ exponent ~64k for +Inf, ~64k-1 for -Inf
b fa_ye
.align 2
.thumb_func
mufp_fmul:
push {r7,r14}
mov r2,r0
eors r2,r1 @ sign of result
lsrs r2,#31
lsls r2,#31
mov r14,r2
lsls r0,#1
lsls r1,#1
lsrs r2,r0,#24 @ xe
beq fm_xe0
cmp r2,#255
beq fm_xe255
fm_xe:
lsrs r3,r1,#24 @ ye
beq fm_ye0
cmp r3,#255
beq fm_ye255
fm_ye:
adds r7,r2,r3 @ exponent of result (will possibly be incremented)
subs r7,#128 @ adjust bias for packing
lsls r0,#8 @ x mantissa
lsls r1,#8 @ y mantissa
lsrs r0,#9
lsrs r1,#9
adds r2,r0,r1 @ for later
mov r12,r2
lsrs r2,r0,#7 @ x[22..7] Q16
lsrs r3,r1,#7 @ y[22..7] Q16
muls r2,r2,r3 @ result [45..14] Q32: never an overestimate and worst case error is 2*(2^7-1)*(2^23-2^7)+(2^7-1)^2 = 2130690049 < 2^31
muls r0,r0,r1 @ result [31..0] Q46
lsrs r2,#18 @ result [45..32] Q14
bcc 1f
cmp r0,#0
bmi 1f
adds r2,#1 @ fix error in r2
1:
lsls r3,r0,#9 @ bits off bottom of result
lsrs r0,#23 @ Q23
lsls r2,#9
adds r0,r2 @ cut'n'shut
add r0,r12 @ implied 1*(x+y) to compensate for no insertion of implied 1s
@ result-1 in r3:r0 Q23+32, i.e., in range [0,3)
lsrs r1,r0,#23
bne fm_0 @ branch if we need to shift down one place
@ here 1<=result<2
cmp r7,#254
bhs fm_3a @ catches both underflow and overflow
lsls r3,#1 @ sticky bits at top of R3, rounding bit in carry
bcc fm_1 @ no rounding
beq fm_2 @ rounding tie?
adds r0,#1 @ round up
fm_1:
adds r7,#1 @ for implied 1
lsls r7,#23 @ pack result
add r0,r7
add r0,r14
pop {r7,r15}
fm_2: @ rounding tie
adds r0,#1
fm_3:
lsrs r0,#1
lsls r0,#1 @ clear bottom bit
b fm_1
@ here 1<=result-1<3
fm_0:
adds r7,#1 @ increment exponent
cmp r7,#254
bhs fm_3b @ catches both underflow and overflow
lsrs r0,#1 @ shift mantissa down
bcc fm_1a @ no rounding
adds r0,#1 @ assume we will round up
cmp r3,#0 @ sticky bits
beq fm_3c @ rounding tie?
fm_1a:
adds r7,r7
adds r7,#1 @ for implied 1
lsls r7,#22 @ pack result
add r0,r7
add r0,r14
pop {r7,r15}
fm_3c:
lsrs r0,#1
lsls r0,#1 @ clear bottom bit
b fm_1a
fm_xe0:
subs r2,#16
fm_xe255:
lsls r2,#8
b fm_xe
fm_ye0:
subs r3,#16
fm_ye255:
lsls r3,#8
b fm_ye
@ here the result is under- or overflowing
fm_3b:
bge fm_4 @ branch on overflow
@ trap case where result is denormal 0x007fffff + 0.5ulp or more
adds r7,#1 @ exponent=-1?
bne fm_5
@ corrected mantissa will be >= 3.FFFFFC (0x1fffffe Q23)
@ so r0 >= 2.FFFFFC (0x17ffffe Q23)