-
Notifications
You must be signed in to change notification settings - Fork 1
/
normal.mac
3935 lines (3690 loc) · 156 KB
/
normal.mac
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
; Copyright 1995-2012 - Mersenne Research, Inc. All rights reserved.
; Author: George Woltman
; Email: [email protected]
;
; These macros efficiently implement the normalization to integers
; and multiplication by two-to-phi powers. Normalization generally
; consists of multiplying the data value by two-to-minus-phi. Rounding the
; value to an integer. Making sure the integer is smaller than
; the maximum allowable integer, generating a carry if necessary.
; Finally, the value is multiplied by two-to-phi and stored.
;
; All combinations of the following variations are supported:
; 1) None, 1D-array, 2D-array of two-to-phi multipliers
; 2) With and without maximum convolution error checking
; 3) With and without multiplying by a small constant
; 4) With and without zeroing of upper FFT data values
;
; All macros do eight FFT data values so that some degree of pipelining
; can be achieved.
;
; For 1D macros, these registers are set on input:
; st(2) = sumout
; st(1) = carry #1
; st(0) = carry #2
; esi = pointer to the FFT data values
; ebx = pointer two-to-power multipliers
; edi = big vs. little array ptr
; eax = big vs. little word flag
;
; For 2D macros, these registers are set on input:
; st(5) = sumout
; st(4) = carry
; st(3) = two-to-phi multiplier
; st(2) = two-to-minus-phi group multiplier
; st(1) = two-to-phi group multiplier
; st(0) = two-to-minus-phi multiplier
; esi = pointer to the FFT data values
; ebx = pointer two-to-power column multipliers
; edx = pointer two-to-power group multipliers
; ebp = big vs. little array ptr
; eax = big vs. little word flag
;
; These macros implement the variants of the normalization routines
; in a non-pipelined way. It is simply too much work to hand optimize
; all 24 normalization macros.
;
; Compute the convolution error and if greater than MAXERR, set MAXERR
fmaxp MACRO reg
LOCAL less
IFDEF PFETCH
fcomi st, st(reg) ;; Compare to maximum error
fcmovb st, st(reg) ;; Copy maximum error if it is greater
fxch st(reg) ;; Save the maxerr
fcomp st ;; Pop non-maximum
ELSE
fcom st(reg) ;; Compare to maximum error
pusher eax
fstsw ax ;; Copy comparison results
test ax, 100h ;; Isolate C0 bit
jnz short less ;; Error is less than maximum
fxch st(reg) ;; Save the maxerr
less: fcomp st ;; Pop non-maximum
popper eax
ENDIF
ENDM
brute_force_error_check MACRO reg
fld BIGVAL
fadd st, st(1)
fsub BIGVAL ;; This is the integer value
fsub st, st(1) ;; This is the convolution error
fabs
fmaxp reg+1 ;; Compare to maximum error
ENDM
; Multiply the FFT result by a small constant
brute_force_mul_by_const MACRO
fadd BIGVAL ;; Round to an integer
fsub BIGVAL
fmul MULCONST ;; Multiply by the small constant
ENDM
; Zero upper words of FFT
brute_force_zero MACRO
fsub st, st ;; Zero the word
ENDM
; *************** 1D macro ******************
; A pipelined version of this code:
; mov al, [ebp] ;; Load big/lit flag
; fld QWORD PTR [esi+0*8] ;; Load value
; fmul QWORD PTR [ebx+0*16+8] ;; Mul value by two-to-minus-phi
; faddp st(4), st ;; x = value + carry
; fld limit_bigmax[eax*8] ;; Load maximum * BIGVAL - BIGVAL
; fadd st, st(4) ;; y = top bits of x
; fld limit_bigmax[eax*8] ;; Load maximum * BIGVAL - BIGVAL
; fsubr st, st(3) ;; z = y - (maximum * BIGVAL - BIGVAL)
; fmul limit_inverse[eax*8] ;; next carry = shifted y
; fsubp st(6), st ;; rounded value = x - z
; fadd QWORD PTR [esi+0*8] ;; sumout += value
; fmulp QWORD PTR [ebx+0*16] ;; new value = val * two-to-phi
; fstp QWORD PTR [esi+0*8] ;; Save the value
norm_1d MACRO ttp, zero, echk, const
;; c2, c1, sumout, maxerr
ttp mov al, [edi+0] ;; Load big vs. little flags
fld QWORD PTR [esi+0*8] ;; Load values1
fadd st(3), st ;; sumout += values1
fmul QWORD PTR [ebx+0*16] ;; Mul by two-to-minus-phi
echk brute_force_error_check 4
const brute_force_mul_by_const
fld QWORD PTR [esi+1*8] ;; Load values2
fadd st(4), st ;; sumout += values2
fmul QWORD PTR [ebx+1*16] ;; Mul by two-to-minus-phi
echk brute_force_error_check 5
const brute_force_mul_by_const
fxch st(3) ;; c1, x1, c2, x2, sumout, maxerr
faddp st(1), st ;; x1 = values + carry1
fxch st(2) ;; x2, c2, x1, sum, err
faddp st(1), st ;; x2 = values + carry2
fld LIMIT_BIGMAX[eax] ;; Load maximum * BIGVAL - BIGVAL
fadd st, st(2) ;; y1 = top bits of x
fld LIMIT_BIGMAX[eax+8] ;; Load maximum * BIGVAL - BIGVAL
fadd st, st(2) ;; y2 = top bits of x
fld LIMIT_INVERSE[eax] ;; Load shifting constant
fmul st, st(2) ;; next carry = shifted y1
fld LIMIT_INVERSE[eax+8] ;; Load shifting constant
fmul st, st(2) ;; next carry = shifted y2
fxch st(3) ;; y1, c1, y2, c2, x2, x1, sum, err
fsub LIMIT_BIGMAX[eax] ;; y1 = y1 - (maximum*BIGVAL-BIGVAL)
fxch st(2) ;; y2, c1, y1, c2, x2, x1, sum, err
fsub LIMIT_BIGMAX[eax+8] ;; y2 = y2 - (maximum*BIGVAL-BIGVAL)
fxch st(2) ;; y1, c1, y2, c2, x2, x1, sum, err
fsubp st(5), st ;; rounded value = x1 - y1
fxch st(1) ;; y2, c1, c2, x2, x1, sum, err
fsubp st(3), st ;; rounded value = x2 - y2
fxch st(3) ;; x1, c2, x2, c1, sum, err
ttp fmul QWORD PTR [ebx+0*16+8] ;; mul by two-to-phi
fstp QWORD PTR [esi+0*8] ;; Save new value1
fxch st(1) ;; x2, c2, c1, sum, err
ttp fmul QWORD PTR [ebx+1*16+8] ;; mul by two-to-phi
zero brute_force_zero
fstp QWORD PTR [esi+1*8] ;; Save new value2
;; c2, c1, sum, err
ttp mov al, [edi+1] ;; Load big vs. little flags
fld QWORD PTR [esi+2*8] ;; Load values1
fadd st(3), st ;; sumout += values1
fmul QWORD PTR [ebx+2*16] ;; Mul by two-to-minus-phi
echk brute_force_error_check 4
const brute_force_mul_by_const
fld QWORD PTR [esi+3*8] ;; Load values2
fadd st(4), st ;; sumout += values2
fmul QWORD PTR [ebx+3*16] ;; Mul by two-to-minus-phi
echk brute_force_error_check 5
const brute_force_mul_by_const
fxch st(3) ;; c1, x1, c2, x2, sum, err
faddp st(1), st ;; x1 = values + carry1
fxch st(2) ;; x2, c2, x1, sum, err
faddp st(1), st ;; x2 = values + carry2
fld LIMIT_BIGMAX[eax] ;; Load maximum * BIGVAL - BIGVAL
fadd st, st(2) ;; y1 = top bits of x
fld LIMIT_BIGMAX[eax+8] ;; Load maximum * BIGVAL - BIGVAL
fadd st, st(2) ;; y2 = top bits of x
fld LIMIT_INVERSE[eax] ;; Load shifting constant
fmul st, st(2) ;; next carry = shifted y1
fld LIMIT_INVERSE[eax+8] ;; Load shifting constant
fmul st, st(2) ;; next carry = shifted y2
fxch st(3) ;; y1, c1, y2, c2, x2, x1, sum, err
fsub LIMIT_BIGMAX[eax] ;; y1 = y1 - (maximum*BIGVAL-BIGVAL)
fxch st(2) ;; y2, c1, y1, c2, x2, x1, sum, err
fsub LIMIT_BIGMAX[eax+8] ;; y2 = y2 - (maximum*BIGVAL-BIGVAL)
fxch st(2) ;; y1, c1, y2, c2, x2, x1, sum, err
fsubp st(5), st ;; rounded value = x1 - y1
fxch st(1) ;; y2, c1, c2, x2, x1, sum, err
fsubp st(3), st ;; rounded value = x2 - y2
fxch st(3) ;; x1, c2, x2, c1, sum, err
ttp fmul QWORD PTR [ebx+2*16+8] ;; mul by two-to-phi
fstp QWORD PTR [esi+2*8] ;; Save new value1
fxch st(1) ;; x2, c2, c1, sum, err
ttp fmul QWORD PTR [ebx+3*16+8] ;; mul by two-to-phi
zero brute_force_zero
fstp QWORD PTR [esi+3*8] ;; Save new value2
ttp mov al, [edi+2] ;; Load big vs. little flags
fld QWORD PTR [esi+4*8] ;; Load values1
fadd st(3), st ;; sumout += values1
fmul QWORD PTR [ebx+4*16] ;; Mul by two-to-minus-phi
echk brute_force_error_check 4
const brute_force_mul_by_const
fld QWORD PTR [esi+5*8] ;; Load values2
fadd st(4), st ;; sumout += values2
fmul QWORD PTR [ebx+5*16] ;; Mul by two-to-minus-phi
echk brute_force_error_check 5
const brute_force_mul_by_const
fxch st(3) ;; c1, x1, c2, x2, sumout, maxerr
faddp st(1), st ;; x1 = values + carry1
fxch st(2) ;; x2, c2, x1, sum, err
faddp st(1), st ;; x2 = values + carry2
fld LIMIT_BIGMAX[eax] ;; Load maximum * BIGVAL - BIGVAL
fadd st, st(2) ;; y1 = top bits of x
fld LIMIT_BIGMAX[eax+8] ;; Load maximum * BIGVAL - BIGVAL
fadd st, st(2) ;; y2 = top bits of x
fld LIMIT_INVERSE[eax] ;; Load shifting constant
fmul st, st(2) ;; next carry = shifted y1
fld LIMIT_INVERSE[eax+8] ;; Load shifting constant
fmul st, st(2) ;; next carry = shifted y2
fxch st(3) ;; y1, c1, y2, c2, x2, x1, sum, err
fsub LIMIT_BIGMAX[eax] ;; y1 = y1 - (maximum*BIGVAL-BIGVAL)
fxch st(2) ;; y2, c1, y1, c2, x2, x1, sum, err
fsub LIMIT_BIGMAX[eax+8] ;; y2 = y2 - (maximum*BIGVAL-BIGVAL)
fxch st(2) ;; y1, c1, y2, c2, x2, x1, sum, err
fsubp st(5), st ;; rounded value = x1 - y1
fxch st(1) ;; y2, c1, c2, x2, x1, sum, err
fsubp st(3), st ;; rounded value = x2 - y2
fxch st(3) ;; x1, c2, x2, c1, sum, err
ttp fmul QWORD PTR [ebx+4*16+8] ;; mul by two-to-phi
fstp QWORD PTR [esi+4*8] ;; Save new value1
fxch st(1) ;; x2, c2, c1, sum, err
ttp fmul QWORD PTR [ebx+5*16+8] ;; mul by two-to-phi
zero brute_force_zero
fstp QWORD PTR [esi+5*8] ;; Save new value2
;; c2, c1, sum, err
ttp mov al, [edi+3] ;; Load big vs. little flags
fld QWORD PTR [esi+6*8] ;; Load values1
fadd st(3), st ;; sumout += values1
fmul QWORD PTR [ebx+6*16] ;; Mul by two-to-minus-phi
echk brute_force_error_check 4
const brute_force_mul_by_const
fld QWORD PTR [esi+7*8] ;; Load values2
fadd st(4), st ;; sumout += values2
fmul QWORD PTR [ebx+7*16] ;; Mul by two-to-minus-phi
echk brute_force_error_check 5
const brute_force_mul_by_const
fxch st(3) ;; c1, x1, c2, x2, sum, err
faddp st(1), st ;; x1 = values + carry1
fxch st(2) ;; x2, c2, x1, sum, err
faddp st(1), st ;; x2 = values + carry2
fld LIMIT_BIGMAX[eax] ;; Load maximum * BIGVAL - BIGVAL
fadd st, st(2) ;; y1 = top bits of x
fld LIMIT_BIGMAX[eax+8] ;; Load maximum * BIGVAL - BIGVAL
fadd st, st(2) ;; y2 = top bits of x
fld LIMIT_INVERSE[eax] ;; Load shifting constant
fmul st, st(2) ;; next carry = shifted y1
fld LIMIT_INVERSE[eax+8] ;; Load shifting constant
fmul st, st(2) ;; next carry = shifted y2
fxch st(3) ;; y1, c1, y2, c2, x2, x1, sum, err
fsub LIMIT_BIGMAX[eax] ;; y1 = y1 - (maximum*BIGVAL-BIGVAL)
fxch st(2) ;; y2, c1, y1, c2, x2, x1, sum, err
fsub LIMIT_BIGMAX[eax+8] ;; y2 = y2 - (maximum*BIGVAL-BIGVAL)
fxch st(2) ;; y1, c1, y2, c2, x2, x1, sum, err
fsubp st(5), st ;; rounded value = x1 - y1
fxch st(1) ;; y2, c1, c2, x2, x1, sum, err
fsubp st(3), st ;; rounded value = x2 - y2
fxch st(3) ;; x1, c2, x2, c1, sum, err
ttp fmul QWORD PTR [ebx+6*16+8] ;; mul by two-to-phi
fstp QWORD PTR [esi+6*8] ;; Save new value1
fxch st(1) ;; x2, c2, c1, sum, err
ttp fmul QWORD PTR [ebx+7*16+8] ;; mul by two-to-phi
zero brute_force_zero
fstp QWORD PTR [esi+7*8] ;; Save new value2
ENDM
; *************** 1D followup macro ******************
; This macro finishes the normalize process by adding the final two
; carries from the first pass back into the lower two data values.
; We take advantage of the fact that the first two-to-phi multiplier
; and the first two-to-minus-phi multiplier are one.
; st(1) = carry #1
; st(0) = carry #2 (wrap around carry)
; esi = pointer to the FFT data values
; edi = big/little array ptr
; ebx = pointer two-to-power multipliers
; eax = big/lit flag
norm012_1d MACRO zero
;; c1, c2
mov al, [edi+0] ;; Load big vs. little flags
fsub BIGVAL ;; Convert wrap-around carry to integer
fld QWORD PTR [esi+1*8] ;; Load values2
fmul QWORD PTR [ebx+1*16] ;; x2 *= two-to-minus-phi
fld QWORD PTR [esi+0*8] ;; Load values1
fadd BIGVAL ;; Compensate for BIGVAL-less carry
fxch st(2) ;; c1, x2, x1, c2
fmul MINUS_C ;; Mul wrap-araound carry by -c
faddp st(2), st ;; x1 = values + carry1
fmul NORM012_FF ;; x2 *= FFTLEN/2K
faddp st(2), st ;; x2 = values + carry2
;; x1, x2
fld LIMIT_BIGMAX[eax] ;; Load maximum * BIGVAL - BIGVAL
fadd st, st(1) ;; y1 = top bits of x
fld LIMIT_BIGMAX[eax+8] ;; Load maximum * BIGVAL - BIGVAL
fadd st, st(3) ;; y2 = top bits of x
;; y2, y1, x1, x2
fld LIMIT_INVERSE[eax] ;; Load shifting constant
fmul st, st(2) ;; next carry = shifted y1
fld LIMIT_INVERSE[eax+8] ;; Load shifting constant
fmul st, st(2) ;; next carry = shifted y2
fxch st(3) ;; y1, c1, y2, c2, x1, x2
fsub LIMIT_BIGMAX[eax] ;; y1 = y1 - (maximum*BIGVAL-BIGVAL)
fxch st(2) ;; y2, c1, y1, c2, x1, x2
fsub LIMIT_BIGMAX[eax+8] ;; y2 = y2 - (maximum*BIGVAL-BIGVAL)
fxch st(2) ;; y1, c1, y2, c2, x1, x2
fsubp st(4), st ;; rounded value = x1 - y1
fxch st(1) ;; y2, c1, c2, x1, x2
fsubp st(4), st ;; rounded value = x2 - y2
fxch st(2) ;; x1, c2, c1, x2
fmul QWORD PTR [ebx+0*16+8] ;; mul by two-to-phi
fstp QWORD PTR [esi+0*8] ;; Save new value1
fxch st(2) ;; x2, c1, c2
no zero fmul QWORD PTR [ebx+1*16+8] ;; mul by two-to-phi
zero brute_force_zero
fstp QWORD PTR [esi+1*8] ;; Save new value2
fxch st(1) ;; c2, c1
mov al, [edi+1] ;; Load big vs. little flags
fld QWORD PTR [esi+2*8] ;; Load values1
fmul QWORD PTR [ebx+2*16] ;; Mul by two-to-minus-phi
fmul NORM012_FF ;; Mul by FFTLEN/2
fld QWORD PTR [esi+3*8] ;; Load values2
fmul QWORD PTR [ebx+3*16] ;; Mul by two-to-minus-phi
fmul NORM012_FF ;; Mul by FFTLEN/2
fxch st(3) ;; c1, x1, c2, x2
faddp st(1), st ;; x1 = values + carry1
fxch st(2) ;; x2, c2, x1
faddp st(1), st ;; x2 = values + carry2
fld LIMIT_BIGMAX[eax] ;; Load maximum * BIGVAL - BIGVAL
fadd st, st(2) ;; y1 = top bits of x
fld LIMIT_BIGMAX[eax+8] ;; Load maximum * BIGVAL - BIGVAL
fadd st, st(2) ;; y2 = top bits of x
fld LIMIT_INVERSE[eax] ;; Load shifting constant
fmul st, st(2) ;; next carry = shifted y1
fld LIMIT_INVERSE[eax+8] ;; Load shifting constant
fmul st, st(2) ;; next carry = shifted y2
fxch st(3) ;; y1, c1, y2, c2, x2, x1
fsub LIMIT_BIGMAX[eax] ;; y1 = y1 - (maximum*BIGVAL-BIGVAL)
fxch st(2) ;; y2, c1, y1, c2, x2, x1
fsub LIMIT_BIGMAX[eax+8] ;; y2 = y2 - (maximum*BIGVAL-BIGVAL)
fxch st(2) ;; y1, c1, y2, c2, x2, x1
fsubp st(5), st ;; rounded value = x1 - y1
fxch st(1) ;; y2, c1, c2, x2, x1
fsubp st(3), st ;; rounded value = x2 - y2
fxch st(3) ;; x1, c2, x2, c1
fmul QWORD PTR [ebx+2*16+8] ;; mul by two-to-phi
fstp QWORD PTR [esi+2*8] ;; Save new value1
fxch st(1) ;; x2, c2, c1
no zero fmul QWORD PTR [ebx+3*16+8] ;; mul by two-to-phi
zero brute_force_zero
fstp QWORD PTR [esi+3*8] ;; Save new value2
mov al, [edi+2] ;; Load big vs. little flags
fld QWORD PTR [esi+4*8] ;; Load values1
fmul QWORD PTR [ebx+4*16] ;; Mul by two-to-minus-phi
fmul NORM012_FF ;; Mul by FFTLEN/2
fld QWORD PTR [esi+5*8] ;; Load values2
fmul QWORD PTR [ebx+5*16] ;; Mul by two-to-minus-phi
fmul NORM012_FF ;; Mul by FFTLEN/2
fxch st(3) ;; c1, x1, c2, x2
faddp st(1), st ;; x1 = values + carry1
fxch st(2) ;; x2, c2, x1
faddp st(1), st ;; x2 = values + carry2
fld LIMIT_BIGMAX[eax] ;; Load maximum * BIGVAL - BIGVAL
fadd st, st(2) ;; y1 = top bits of x
fld LIMIT_BIGMAX[eax+8] ;; Load maximum * BIGVAL - BIGVAL
fadd st, st(2) ;; y2 = top bits of x
fld LIMIT_INVERSE[eax] ;; Load shifting constant
fmul st, st(2) ;; next carry = shifted y1
fld LIMIT_INVERSE[eax+8] ;; Load shifting constant
fmul st, st(2) ;; next carry = shifted y2
fxch st(3) ;; y1, c1, y2, c2, x2, x1
fsub LIMIT_BIGMAX[eax] ;; y1 = y1 - (maximum*BIGVAL-BIGVAL)
fxch st(2) ;; y2, c1, y1, c2, x2, x1
fsub LIMIT_BIGMAX[eax+8] ;; y2 = y2 - (maximum*BIGVAL-BIGVAL)
fxch st(2) ;; y1, c1, y2, c2, x2, x1
fsubp st(5), st ;; rounded value = x1 - y1
fxch st(1) ;; y2, c1, c2, x2, x1
fsubp st(3), st ;; rounded value = x2 - y2
fxch st(3) ;; x1, c2, x2, c1
fmul QWORD PTR [ebx+4*16+8] ;; mul by two-to-phi
fstp QWORD PTR [esi+4*8] ;; Save new value1
fxch st(1) ;; x2, c2, c1
no zero fmul QWORD PTR [ebx+5*16+8] ;; mul by two-to-phi
zero brute_force_zero
fstp QWORD PTR [esi+5*8] ;; Save new value2
;; c2, c1
no zero fsub BIGVAL ;; Make carry an integer
no zero fmul QWORD PTR [ebx+7*16+8] ;; c1 *= two-to-phi
no zero fadd QWORD PTR [esi+7*8] ;; Load values1
zero brute_force_zero
fstp QWORD PTR [esi+7*8] ;; Save new value1
fsub BIGVAL ;; Make carry an integer
fmul QWORD PTR [ebx+6*16+8] ;; c2 *= two-to-phi
fadd QWORD PTR [esi+6*8] ;; Load values2
fstp QWORD PTR [esi+6*8] ;; Save new value2
ENDM
; This is the normalization routine when we are computing modulo k*2^n+c
; with a zero-padded 2^2n FFT. We do this by multiplying the lower FFT
; word by k and adding in the upper word by -c. Of course, this is made
; very tedious because we have to carefully avoid any loss of precision.
;
; st(3) = MAXERR
; st(2) = sumout
; st(1) = carry #1 (traditional carry)
; st(0) = carry #2 (previous high FFT data - not yet mul'ed by K)
; esi = pointer to the FFT data values
; ebx = pointer two-to-phi multipliers
; edi = pointer to array of big vs. little flags
; eax = big vs. little word flag #1
norm_1d_zpad MACRO ttp, echk, const
;; c2, c1, sumout, maxerr
ttp mov al, [edi] ;; Load big vs. little flags
fld QWORD PTR [esi+0*8] ;; Load v1
fadd st(3), st ;; sumout += v1
fmul QWORD PTR [ebx+0*16] ;; v1 *= two-to-minus-phi
fld QWORD PTR [esi+1*8] ;; Load v2
fadd st(4), st ;; sumout += v2
fmul QWORD PTR [ebx+1*16] ;; v2 *= two-to-minus-phi
fld BIGBIGVAL ;; a1 = big word rounding constant
fld BIGVAL ;; b1 = integer rounding constant
fxch st(4) ;; c2,a1,v2,v1,b1,c1,sumout,maxerr
faddp st(3), st ;; v1 += previous high FFT data (c2)
fadd st, st(2) ;; a1 = a1 + v1 (Round to big word)
fxch st(2) ;; v1,v2,a1,b1,c1,sumout,maxerr
no echk faddp st(3), st ;; b1 += v1 (Round to integer)
no echk fxch st(1) ;; a1,v2,b1,c1,sumout,maxerr
no echk fsub BIGBIGVAL ;; a1 -= big word rounding constant
no echk fxch st(2) ;; b1,v2,a1,c1,sumout,maxerr
no echk fsub BIGVAL ;; b1 -= integer rounding constant
echk fadd st(3), st ;; b1 += v1 (Round to integer)
echk fxch st(2) ;; a1,v2,v1,b1,c1,sumout,maxerr
echk fsub BIGBIGVAL ;; a1 -= big word rounding constant
echk fxch st(3) ;; b1,v2,v1,a1,c1,sumout,maxerr
echk fsub BIGVAL ;; b1 -= integer rounding constant
echk fsub st(2), st ;; v1 -= b1 (convolution error)
echk fxch st(2) ;; v1,v2,b1,a1,c1,sumout,maxerr
echk fabs ;; Compute absolute value
echk fmaxp 6 ;; Compute maximum error
echk fxch st(1) ;; b1,v2,a1,c1,sumout,maxerr
fld BIGBIGVAL ;; a2 = big word rounding constant
fadd st, st(2) ;; a2 = a2 + v2 (Round to big word)
no echk fxch st(2) ;; v2,b1,a2,a1,c1,sumout,maxerr
no echk fadd BIGVAL ;; b2 += v2 (Round to integer)
no echk fxch st(2) ;; a2,b1,b2,a1,c1,sumout,maxerr
no echk fsub BIGBIGVAL ;; a2 -= big word rounding constant
no echk fxch st(2) ;; b2,b1,a2,a1,c1,sumout,maxerr
no echk fsub BIGVAL ;; b2 -= integer rounding constant
echk fld BIGVAL ;; b2 = integer rounding constant
echk fadd st, st(3) ;; b2 += v2 (Round to integer)
echk fxch st(1) ;; a2,b2,b1,v2,a1,c1,sumout,maxerr
echk fsub BIGBIGVAL ;; a2 -= big word rounding constant
echk fxch st(1) ;; b2,a2,b1,v2,a1,c1,sumout,maxerr
echk fsub BIGVAL ;; b2 -= integer rounding constant
echk fsub st(3), st ;; v2 -= b2 (convolution error)
echk fxch st(3) ;; v2,a2,b1,b2,a1,c1,sumout,maxerr
echk fabs ;; Compute absolute value
echk fmaxp 7 ;; Compute maximum error
echk fxch st(2) ;; b2,b1,a2,a1,c1,sumout,maxerr
fxch st(3) ;; a1,b1,a2,b2,c1,sumout,maxerr
fsub st(1), st ;; b1 -= a1 (low bigword bits)
fmul LIMIT_INVERSE[eax] ;; a1 *= shift const (next hi carry)
fxch st(2) ;; a2,b1,a1,b2,c1,sumout,maxerr
fsub st(3), st ;; b2 -= a2 (low bigword bits)
no const fld K_LO ;; x1 = low bits of k
const fld K_TIMES_MULCONST_LO ;; x1 = low bits of k*mulconst
fmul st, st(2) ;; x1 *= b1
fxch st(2) ;; b1,a2,x1,a1,b2,c1,sumout,maxerr
no const fmul K_HI ;; b1 *= high bits of k
const fmul K_TIMES_MULCONST_HI ;; b1 *= high bits of k*mulconst
fxch st(1) ;; a2,b1,x1,a1,b2,c1,sumout,maxerr
no const fmul MINUS_C ;; a2 *= -c
const fmul MINUS_C_TIMES_MULCONST ;; a2 *= -c*mulconst
fxch st(5) ;; c1,b1,x1,a1,b2,a2,sumout,maxerr
faddp st(2), st ;; x1 += carry
fxch st(3) ;; b2,x1,a1,b1,a2,sumout,maxerr
no const fmul MINUS_C ;; b2 *= -c
const fmul MINUS_C_TIMES_MULCONST ;; b2 *= -c*mulconst
fxch st(4) ;; a2,x1,a1,b1,b2,sumout,maxerr
faddp st(3), st ;; b1 += a2 (Add upper FFT word to lower FFT word
faddp st(3), st ;; x1 += b2 (Add upper FFT word to lower FFT word
fld LIMIT_BIGMAX[eax] ;; y1 = Load maximum * BIGVAL - BIGVAL
fadd st, st(3) ;; y1 += x1 (top bits of x1)
fadd st(2), st ;; b1 += y1 (Add in upper mul-by-const bits
fxch st(2) ;; b1,a1,y1,x1,sumout,maxerr
fmul LIMIT_INVERSE[eax] ;; next low carry = shifted b1
fxch st(2) ;; y1,a1,b1,x1,sumout,maxerr
fsub LIMIT_BIGMAX[eax] ;; y1 = y1 - (maximum*BIGVAL-BIGVAL)
fsubp st(3), st ;; rounded value = x1 - y1
fxch st(2) ;; x1,b1,a1,sumout,maxerr
ttp fmul QWORD PTR [ebx+0*16+8] ;; new value1 = val * two-to-phi
fstp QWORD PTR [esi+0*8] ;; Save value1
fxch st(1) ;; c2,c1,sumout,maxerr
;; c2, c1, sumout, maxerr
ttp mov al, [edi+1] ;; Load big vs. little flags
fld QWORD PTR [esi+2*8] ;; Load v1
fadd st(3), st ;; sumout += v1
fmul QWORD PTR [ebx+2*16] ;; v1 *= two-to-minus-phi
fld QWORD PTR [esi+3*8] ;; Load v2
fadd st(4), st ;; sumout += v2
fmul QWORD PTR [ebx+3*16] ;; v2 *= two-to-minus-phi
fld BIGBIGVAL ;; a1 = big word rounding constant
fld BIGVAL ;; b1 = integer rounding constant
fxch st(4) ;; c2,a1,v2,v1,b1,c1,sumout,maxerr
faddp st(3), st ;; v1 += previous high FFT data (c2)
fadd st, st(2) ;; a1 = a1 + v1 (Round to big word)
fxch st(2) ;; v1,v2,a1,b1,c1,sumout,maxerr
no echk faddp st(3), st ;; b1 += v1 (Round to integer)
no echk fxch st(1) ;; a1,v2,b1,c1,sumout,maxerr
no echk fsub BIGBIGVAL ;; a1 -= big word rounding constant
no echk fxch st(2) ;; b1,v2,a1,c1,sumout,maxerr
no echk fsub BIGVAL ;; b1 -= integer rounding constant
echk fadd st(3), st ;; b1 += v1 (Round to integer)
echk fxch st(2) ;; a1,v2,v1,b1,c1,sumout,maxerr
echk fsub BIGBIGVAL ;; a1 -= big word rounding constant
echk fxch st(3) ;; b1,v2,v1,a1,c1,sumout,maxerr
echk fsub BIGVAL ;; b1 -= integer rounding constant
echk fsub st(2), st ;; v1 -= b1 (convolution error)
echk fxch st(2) ;; v1,v2,b1,a1,c1,sumout,maxerr
echk fabs ;; Compute absolute value
echk fmaxp 6 ;; Compute maximum error
echk fxch st(1) ;; b1,v2,a1,c1,sumout,maxerr
fld BIGBIGVAL ;; a2 = big word rounding constant
fadd st, st(2) ;; a2 = a2 + v2 (Round to big word)
no echk fxch st(2) ;; v2,b1,a2,a1,c1,sumout,maxerr
no echk fadd BIGVAL ;; b2 += v2 (Round to integer)
no echk fxch st(2) ;; a2,b1,b2,a1,c1,sumout,maxerr
no echk fsub BIGBIGVAL ;; a2 -= big word rounding constant
no echk fxch st(2) ;; b2,b1,a2,a1,c1,sumout,maxerr
no echk fsub BIGVAL ;; b2 -= integer rounding constant
echk fld BIGVAL ;; b2 = integer rounding constant
echk fadd st, st(3) ;; b2 += v2 (Round to integer)
echk fxch st(1) ;; a2,b2,b1,v2,a1,c1,sumout,maxerr
echk fsub BIGBIGVAL ;; a2 -= big word rounding constant
echk fxch st(1) ;; b2,a2,b1,v2,a1,c1,sumout,maxerr
echk fsub BIGVAL ;; b2 -= integer rounding constant
echk fsub st(3), st ;; v2 -= b2 (convolution error)
echk fxch st(3) ;; v2,a2,b1,b2,a1,c1,sumout,maxerr
echk fabs ;; Compute absolute value
echk fmaxp 7 ;; Compute maximum error
echk fxch st(2) ;; b2,b1,a2,a1,c1,sumout,maxerr
fxch st(3) ;; a1,b1,a2,b2,c1,sumout,maxerr
fsub st(1), st ;; b1 -= a1 (low bigword bits)
fmul LIMIT_INVERSE[eax] ;; a1 *= shift const (next hi carry)
fxch st(2) ;; a2,b1,a1,b2,c1,sumout,maxerr
fsub st(3), st ;; b2 -= a2 (low bigword bits)
no const fld K_LO ;; x1 = low bits of k
const fld K_TIMES_MULCONST_LO ;; x1 = low bits of k*mulconst
fmul st, st(2) ;; x1 *= b1
fxch st(2) ;; b1,a2,x1,a1,b2,c1,sumout,maxerr
no const fmul K_HI ;; b1 *= high bits of k
const fmul K_TIMES_MULCONST_HI ;; b1 *= high bits of k*mulconst
fxch st(1) ;; a2,b1,x1,a1,b2,c1,sumout,maxerr
no const fmul MINUS_C ;; a2 *= -c
const fmul MINUS_C_TIMES_MULCONST ;; a2 *= -c*mulconst
fxch st(5) ;; c1,b1,x1,a1,b2,a2,sumout,maxerr
faddp st(2), st ;; x1 += carry
fxch st(3) ;; b2,x1,a1,b1,a2,sumout,maxerr
no const fmul MINUS_C ;; b2 *= -c
const fmul MINUS_C_TIMES_MULCONST ;; b2 *= -c*mulconst
fxch st(4) ;; a2,x1,a1,b1,b2,sumout,maxerr
faddp st(3), st ;; b1 += a2 (Add upper FFT word to lower FFT word
faddp st(3), st ;; x1 += b2 (Add upper FFT word to lower FFT word
fld LIMIT_BIGMAX[eax] ;; y1 = Load maximum * BIGVAL - BIGVAL
fadd st, st(3) ;; y1 += x1 (top bits of x1)
fadd st(2), st ;; b1 += y1 (Add in upper mul-by-const bits
fxch st(2) ;; b1,a1,y1,x1,sumout,maxerr
fmul LIMIT_INVERSE[eax] ;; next low carry = shifted b1
fxch st(2) ;; y1,a1,b1,x1,sumout,maxerr
fsub LIMIT_BIGMAX[eax] ;; y1 = y1 - (maximum*BIGVAL-BIGVAL)
fsubp st(3), st ;; rounded value = x1 - y1
fxch st(2) ;; x1,b1,a1,sumout,maxerr
ttp fmul QWORD PTR [ebx+2*16+8] ;; new value1 = val * two-to-phi
fstp QWORD PTR [esi+2*8] ;; Save value1
fxch st(1) ;; c2,c1,sumout,maxerr
fldz ;; new value2 = zero
fst QWORD PTR [esi+1*8] ;; Zero previous value2
fstp QWORD PTR [esi+3*8] ;; Zero current value2
ENDM
; This macro is similar to norm012_1d, but is for the zero padding case.
; st(1) = carry #1 (traditional carry)
; st(0) = carry #2 (previous high FFT data - not yet mul'ed by K)
; esi = pointer to the FFT data values
; ebp = pointer two-to-power multipliers
; edi = big vs. litle array pointer
; NOTE: If RATIONAL_FFT we could eliminate 8 multiplies.
norm012_1d_zpad MACRO const
LOCAL smallk, mediumk, div_k_done
;; Rather than calculate high FFT carry times k and then later dividing
;; by k, we multiply FFT high carry by const and we'll add it
;; to the lower FFT data later (after multiplying by -c).
const fmul MULCONST
;; Strip BIGVAL from the traditional carry, we'll add the traditional
;; carry in later when we are working on the ZPAD0 - ZPAD6 values.
fxch st(1) ;; c1, c2, sumout, maxerr
fsub BIGVAL ;; Integerize traditional carry
;; Multiply ZPAD0 through ZPAD6 by const * -c. This, in essense,
;; wraps this data from above the FFT data area to the halfway point.
;; Later on we'll divide this by K to decide which data needs wrapping
;; all the way down to the bottom of the FFT data.
;; NOTE that ZPAD0's column multiplier is 1.0. Also, ZPAD6 will not
;; be bigger than a big word. We must be careful to handle c's up
;; to about 30 bits
mov al, [edi] ;; Load big vs. little flags
fld ZPAD0 ;; Load values1
fadd ADDIN_VALUE ;; Add in the requested value
fadd BIGVAL ;; Round to an integer
fsub BIGVAL
fld BIGBIGVAL ;; Big word rounding constant
fadd st, st(1) ;; Round to multiple of big word
fsub BIGBIGVAL
fsub st(1), st ;; Compute low bigword bits
fmul LIMIT_INVERSE[eax] ;; Saved shifted high ZPAD data
fxch st(1) ;; lowbits,hibits,c1,c2,sumout,maxerr
no const fmul MINUS_C
const fmul MINUS_C_TIMES_MULCONST
faddp st(2), st ;; Add in traditional carry
fld BIGBIGVAL ;; Big word rounding constant
fadd st, st(2) ;; Round to multiple of big word
fsub BIGBIGVAL
fsub st(2), st ;; Compute low bigword bits
fmul LIMIT_INVERSE[eax] ;; Shift high ZPAD data
fxch st(2) ;; lo(z),hibits,hi(z),c2,sumout,maxerr
fstp ZPAD0
mov al, [edi+1] ;; Load big vs. little flags
fld ZPAD1 ;; Load values1
fmul QWORD PTR [ebx+1*32] ;; Mul values1 by two-to-minus-phi
fmul NORM012_FF ;; Mul by FFTLEN/2
fadd BIGVAL ;; Round to an integer
fsub BIGVAL
fld BIGBIGVAL ;; Big word rounding constant
fadd st, st(1) ;; Round to multiple of big word
fsub BIGBIGVAL
fsub st(1), st ;; Compute low bigword bits
fmul LIMIT_INVERSE[eax] ;; Saved shifted high ZPAD data
fxch st(2) ;; hibits0,lobits,hibits,c1,c2,sumout,maxerr
faddp st(1), st ;; Add in prev shifted high ZPAD data
no const fmul MINUS_C
const fmul MINUS_C_TIMES_MULCONST
faddp st(2), st ;; Add in high part of last calculation
fld BIGBIGVAL ;; Big word rounding constant
fadd st, st(2) ;; Round to multiple of big word
fsub BIGBIGVAL
fsub st(2), st ;; Compute low bigword bits
fmul LIMIT_INVERSE[eax] ;; Shift high ZPAD data
fxch st(2) ;; lo(z),hibits,hi(z),c2,sumout,maxerr
fstp ZPAD1
mov al, [edi+2] ;; Load big vs. little flags
fld ZPAD2 ;; Load values1
fmul QWORD PTR [ebx+2*32] ;; Mul values1 by two-to-minus-phi
fmul NORM012_FF ;; Mul by FFTLEN/2
fadd BIGVAL ;; Round to an integer
fsub BIGVAL
fld BIGBIGVAL ;; Big word rounding constant
fadd st, st(1) ;; Round to multiple of big word
fsub BIGBIGVAL
fsub st(1), st ;; Compute low bigword bits
fmul LIMIT_INVERSE[eax] ;; Saved shifted high ZPAD data
fxch st(2) ;; hibits0,lobits,hibits,c1,c2,sumout,maxerr
faddp st(1), st ;; Add in prev shifted high ZPAD data
no const fmul MINUS_C
const fmul MINUS_C_TIMES_MULCONST
faddp st(2), st ;; Add in high part of last calculation
fld BIGBIGVAL ;; Big word rounding constant
fadd st, st(2) ;; Round to multiple of big word
fsub BIGBIGVAL
fsub st(2), st ;; Compute low bigword bits
fmul LIMIT_INVERSE[eax] ;; Shift high ZPAD data
fxch st(2) ;; lo(z),hibits,hi(z),c2,sumout,maxerr
fstp ZPAD2
mov al, [edi+3] ;; Load big vs. little flags
fld ZPAD3 ;; Load values1
fmul QWORD PTR [ebx+3*32] ;; Mul values1 by two-to-minus-phi
fmul NORM012_FF ;; Mul by FFTLEN/2
fadd BIGVAL ;; Round to an integer
fsub BIGVAL
fld BIGBIGVAL ;; Big word rounding constant
fadd st, st(1) ;; Round to multiple of big word
fsub BIGBIGVAL
fsub st(1), st ;; Compute low bigword bits
fmul LIMIT_INVERSE[eax] ;; Saved shifted high ZPAD data
fxch st(2) ;; hibits0,lobits,hibits,c1,c2,sumout,maxerr
faddp st(1), st ;; Add in prev shifted high ZPAD data
no const fmul MINUS_C
const fmul MINUS_C_TIMES_MULCONST
faddp st(2), st ;; Add in high part of last calculation
fld BIGBIGVAL ;; Big word rounding constant
fadd st, st(2) ;; Round to multiple of big word
fsub BIGBIGVAL
fsub st(2), st ;; Compute low bigword bits
fmul LIMIT_INVERSE[eax] ;; Shift high ZPAD data
fxch st(2) ;; lo(z),hibits,hi(z),c2,sumout,maxerr
fstp ZPAD3
mov al, [edi+4] ;; Load big vs. little flags
fld ZPAD4 ;; Load values1
fmul QWORD PTR [ebx+4*32] ;; Mul values1 by two-to-minus-phi
fmul NORM012_FF ;; Mul by FFTLEN/2
fadd BIGVAL ;; Round to an integer
fsub BIGVAL
fld BIGBIGVAL ;; Big word rounding constant
fadd st, st(1) ;; Round to multiple of big word
fsub BIGBIGVAL
fsub st(1), st ;; Compute low bigword bits
fmul LIMIT_INVERSE[eax] ;; Saved shifted high ZPAD data
fxch st(2) ;; hibits0,lobits,hibits,c1,c2,sumout,maxerr
faddp st(1), st ;; Add in prev shifted high ZPAD data
no const fmul MINUS_C
const fmul MINUS_C_TIMES_MULCONST
faddp st(2), st ;; Add in high part of last calculation
fld BIGBIGVAL ;; Big word rounding constant
fadd st, st(2) ;; Round to multiple of big word
fsub BIGBIGVAL
fsub st(2), st ;; Compute low bigword bits
fmul LIMIT_INVERSE[eax] ;; Shift high ZPAD data
fxch st(2) ;; lo(z),hibits,hi(z),c2,sumout,maxerr
fstp ZPAD4
mov al, [edi+5] ;; Load big vs. little flags
fld ZPAD5 ;; Load values1
fmul QWORD PTR [ebx+5*32] ;; Mul values1 by two-to-minus-phi
fmul NORM012_FF ;; Mul by FFTLEN/2
fadd BIGVAL ;; Round to an integer
fsub BIGVAL
fld BIGBIGVAL ;; Big word rounding constant
fadd st, st(1) ;; Round to multiple of big word
fsub BIGBIGVAL
fsub st(1), st ;; Compute low bigword bits
fmul LIMIT_INVERSE[eax] ;; Saved shifted high ZPAD data
fxch st(2) ;; hibits0,lobits,hibits,c1,c2,sumout,maxerr
faddp st(1), st ;; Add in prev shifted high ZPAD data
no const fmul MINUS_C
const fmul MINUS_C_TIMES_MULCONST
faddp st(2), st ;; Add in high part of last calculation
fld BIGBIGVAL ;; Big word rounding constant
fadd st, st(2) ;; Round to multiple of big word
fsub BIGBIGVAL
fsub st(2), st ;; Compute low bigword bits
fmul LIMIT_INVERSE[eax] ;; Shift high ZPAD data
fxch st(2) ;; lo(z),hibits,hi(z),c2,sumout,maxerr
fstp ZPAD5
fld ZPAD6 ;; Load values1
fmul QWORD PTR [ebx+6*32] ;; Mul values1 by two-to-minus-phi
fmul NORM012_FF ;; Mul by FFTLEN/2
fadd BIGVAL ;; Round to an integer
fsub BIGVAL
faddp st(1), st ;; Add in shifted high ZPAD data
no const fmul MINUS_C
const fmul MINUS_C_TIMES_MULCONST
faddp st(1), st ;; Add in high part of last calculation
fstp ZPAD6
;; Divide the zpad data by k. Store the integer part in TMP
;; and the remainder in ZPAD0. Later we will wrap the integer part
;; down to the bottom of the FFT data area (and multiply by -c).
;; And we will store the remainder in the upper half of the FFT
;; data area.
;; Note there are three cases to handle. K is smaller than a big word.
;; K is between one and 2 big words in size. And K is more than
;; 2 big words in size.
cmp ZPAD_TYPE, 2 ;; Are we dealing with case 1,2,or 3
jl smallk ;; One word case
je mediumk ;; Two word case
;; This case does the divide by k where k is three words
fld ZPAD6 ;; Load zpad word (high bits)
fld ZPAD5 ;; Load zpad word (middle bits)
fld ZPAD4 ;; Load zpad word (low bits)
fld ZPAD_INVERSE_K6 ;; Load shifted 1/k
fmul st, st(3) ;; Mul ZPAD by shifted 1/k
fadd BIGVAL ;; Round to integer
fsub BIGVAL
fld ZPAD_K6_HI ;; Load high bits of k
fmul st, st(1)
fsubp st(4), st ;; Calculate high bits of remainder
fld ZPAD_K6_MID ;; Load middle bits of k
fmul st, st(1)
fsubp st(3), st ;; Calculate middle bits of remainder
fld ZPAD_K6_LO ;; Load low bits of k
fmul st, st(1)
fsubp st(2), st ;; Calculate low bits of remainder
fstp TMP5 ;; Save word of zpad / k
fxch st(2) ;; hi,mid,lo,c2,sumout,maxerr
fmul ZPAD_SHIFT6 ;; Shift previous zpad word
faddp st(1), st ;; Add to create new high zpad bits
fld ZPAD3 ;; Load zpad word (new low bits)
fld ZPAD_SHIFT5 ;; Combine high and medium bits
fmul st, st(2)
fadd st, st(3)
fmul ZPAD_INVERSE_K5 ;; Mul ZPAD by shifted 1/k
fadd BIGVAL ;; Round to integer
fsub BIGVAL
fld ZPAD_K5_HI ;; Load high bits of k
fmul st, st(1)
fsubp st(3), st ;; Calculate high bits of remainder
fld ZPAD_K5_MID ;; Load middle bits of k
fmul st, st(1)
fsubp st(4), st ;; Calculate middle bits of remainder
fld ZPAD_K5_LO ;; Load low bits of k
fmul st, st(1)
fsubp st(2), st ;; Calculate low bits of remainder
fstp TMP4 ;; Save word of zpad / k
fxch st(1) ;; hi,lo,mid,c2,sumout,maxerr
fmul ZPAD_SHIFT5 ;; Shift previous zpad word
faddp st(2), st ;; Add to create new high zpad bits
fld ZPAD2 ;; Load zpad word (new low bits)
fld ZPAD_SHIFT4 ;; Combine high and medium bits
fmul st, st(3)
fadd st, st(2)
fmul ZPAD_INVERSE_K4 ;; Mul ZPAD by shifted 1/k
fadd BIGVAL ;; Round to integer
fsub BIGVAL
fld ZPAD_K4_HI ;; Load high bits of k
fmul st, st(1)
fsubp st(4), st ;; Calculate high bits of remainder
fld ZPAD_K4_MID ;; Load middle bits of k
fmul st, st(1)
fsubp st(3), st ;; Calculate middle bits of remainder
fld ZPAD_K4_LO ;; Load low bits of k
fmul st, st(1)
fsubp st(2), st ;; Calculate low bits of remainder
fstp TMP3 ;; Save word of zpad / k
fxch st(2) ;; hi,mid,lo,c2,sumout,maxerr
fmul ZPAD_SHIFT4 ;; Shift previous zpad word
faddp st(1), st ;; Add to create new high zpad bits
fld ZPAD1 ;; Load zpad word (new low bits)
fld ZPAD_SHIFT3 ;; Combine high and medium bits
fmul st, st(2)
fadd st, st(3)
fmul ZPAD_INVERSE_K3 ;; Mul ZPAD by shifted 1/k
fadd BIGVAL ;; Round to integer
fsub BIGVAL
fld ZPAD_K3_HI ;; Load high bits of k
fmul st, st(1)
fsubp st(3), st ;; Calculate high bits of remainder
fld ZPAD_K3_MID ;; Load middle bits of k
fmul st, st(1)
fsubp st(4), st ;; Calculate middle bits of remainder
fld ZPAD_K3_LO ;; Load low bits of k
fmul st, st(1)
fsubp st(2), st ;; Calculate low bits of remainder
fstp TMP2 ;; Save word of zpad / k
fxch st(1) ;; hi,lo,mid,c2,sumout,maxerr
fmul ZPAD_SHIFT3 ;; Shift previous zpad word
faddp st(2), st ;; Add to create new high zpad bits
fld ZPAD0 ;; Load zpad word (new low bits)
fld ZPAD_SHIFT2 ;; Combine high and medium bits
fmul st, st(3)
fadd st, st(2)
fmul ZPAD_INVERSE_K2 ;; Mul ZPAD by shifted 1/k
fadd BIGVAL ;; Round to integer
fsub BIGVAL
fld ZPAD_K2_HI ;; Load high bits of k
fmul st, st(1)
fsubp st(4), st ;; Calculate high bits of remainder
fld ZPAD_K2_MID ;; Load middle bits of k
fmul st, st(1)
fsubp st(3), st ;; Calculate middle bits of remainder
fld ZPAD_K2_LO ;; Load low bits of k
fmul st, st(1)
fsubp st(2), st ;; Calculate low bits of remainder
fstp TMP1 ;; Save word of zpad / k
fxch st(2) ;; hi,mid,lo,c2,sumout,maxerr
fmul ZPAD_SHIFT2 ;; Shift previous zpad word
faddp st(1), st ;; Add to create new high zpad bits
fmul ZPAD_SHIFT1 ;; Shift previous zpad word
faddp st(1), st ;; Add to create new high zpad bits
fstp ZPAD0 ;; Save remainder of zpad / k
fldz ;; Zero words that other cases set
fstp TMP6
jmp div_k_done
;; This case does the divide by k where k is two words
mediumk:
fld ZPAD6 ;; Load zpad word (high bits)
fld ZPAD5 ;; Load zpad word (low bits)
fld ZPAD_INVERSE_K6 ;; Load shifted 1/k
fmul st, st(2) ;; Mul ZPAD by shifted 1/k
fadd BIGVAL ;; Round to integer
fsub BIGVAL
fld ZPAD_K6_HI ;; Load high bits of k
fmul st, st(1)
fsubp st(3), st ;; Calculate high bits of remainder
fld ZPAD_K6_LO ;; Load low bits of k
fmul st, st(1)
fsubp st(2), st ;; Calculate low bits of remainder
fstp TMP6 ;; Save word of zpad / k
fxch st(1) ;; hi,lo,c2,sumout,maxerr
fmul ZPAD_SHIFT6 ;; Shift previous zpad word
faddp st(1), st ;; Add to create new high zpad bits
fld ZPAD4 ;; Load zpad word (new low bits)
fld ZPAD_INVERSE_K5 ;; Load shifted 1/k
fmul st, st(2) ;; Mul ZPAD by shifted 1/k
fadd BIGVAL ;; Round to integer
fsub BIGVAL
fld ZPAD_K5_HI ;; Load high bits of k
fmul st, st(1)
fsubp st(3), st ;; Calculate high bits of remainder
fld ZPAD_K5_LO ;; Load low bits of k
fmul st, st(1)
fsubp st(2), st ;; Calculate low bits of remainder
fstp TMP5 ;; Save word of zpad / k
fxch st(1) ;; hi,lo,c2,sumout,maxerr
fmul ZPAD_SHIFT5 ;; Shift previous zpad word
faddp st(1), st ;; Add to create new high zpad bits
fld ZPAD3 ;; Load zpad word (new low bits)
fld ZPAD_INVERSE_K4 ;; Load shifted 1/k
fmul st, st(2) ;; Mul ZPAD by shifted 1/k
fadd BIGVAL ;; Round to integer
fsub BIGVAL
fld ZPAD_K4_HI ;; Load high bits of k
fmul st, st(1)
fsubp st(3), st ;; Calculate high bits of remainder
fld ZPAD_K4_LO ;; Load low bits of k
fmul st, st(1)
fsubp st(2), st ;; Calculate low bits of remainder
fstp TMP4 ;; Save word of zpad / k
fxch st(1) ;; hi,lo,c2,sumout,maxerr
fmul ZPAD_SHIFT4 ;; Shift previous zpad word
faddp st(1), st ;; Add to create new high zpad bits
fld ZPAD2 ;; Load zpad word (new low bits)
fld ZPAD_INVERSE_K3 ;; Load shifted 1/k
fmul st, st(2) ;; Mul ZPAD by shifted 1/k
fadd BIGVAL ;; Round to integer
fsub BIGVAL
fld ZPAD_K3_HI ;; Load high bits of k
fmul st, st(1)
fsubp st(3), st ;; Calculate high bits of remainder
fld ZPAD_K3_LO ;; Load low bits of k
fmul st, st(1)
fsubp st(2), st ;; Calculate low bits of remainder
fstp TMP3 ;; Save word of zpad / k
fxch st(1) ;; hi,lo,c2,sumout,maxerr
fmul ZPAD_SHIFT3 ;; Shift previous zpad word
faddp st(1), st ;; Add to create new high zpad bits
fld ZPAD1 ;; Load zpad word (new low bits)
fld ZPAD_INVERSE_K2 ;; Load shifted 1/k
fmul st, st(2) ;; Mul ZPAD by shifted 1/k
fadd BIGVAL ;; Round to integer
fsub BIGVAL
fld ZPAD_K2_HI ;; Load high bits of k
fmul st, st(1)
fsubp st(3), st ;; Calculate high bits of remainder
fld ZPAD_K2_LO ;; Load low bits of k
fmul st, st(1)
fsubp st(2), st ;; Calculate low bits of remainder
fstp TMP2 ;; Save word of zpad / k
fxch st(1) ;; hi,lo,c2,sumout,maxerr
fmul ZPAD_SHIFT2 ;; Shift previous zpad word
faddp st(1), st ;; Add to create new high zpad bits
fld ZPAD0 ;; Load zpad word (new low bits)
fld ZPAD_INVERSE_K1 ;; Load shifted 1/k
fmul st, st(2) ;; Mul ZPAD by shifted 1/k
fadd BIGVAL ;; Round to integer
fsub BIGVAL
fld ZPAD_K1_HI ;; Load high bits of k
fmul st, st(1)
fsubp st(3), st ;; Calculate high bits of remainder
fld ZPAD_K1_LO ;; Load low bits of k
fmul st, st(1)
fsubp st(2), st ;; Calculate low bits of remainder
fstp TMP1 ;; Save word of zpad / k