-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhgpass2.mac
5338 lines (4772 loc) · 160 KB
/
hgpass2.mac
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
; Copyright 2001-2010 - Mersenne Research, Inc. All rights reserved
; Author: George Woltman
; Email: [email protected]
;
; These macros take the basic XMM FFT building blocks and build even
; larger building blocks used in the final pass of a multi-pass FFT.
;
; ********************************************************
; ********************************************************
; ******************* PASS 2 MACROS ********************
; ********************************************************
; ********************************************************
; The goal of this pass is to perform the last FFT levels.
;
; To reduce the amount of sine-cosine data we read in from
; main memory, we pre-multiply each set of values by a different
; set of sine-cosine values and then perform a standard 11-level
; FFT on each 2 sets of 2048 complex values. The root of this idea
; comes from articles by David Bailey available on the web.
dist128 = (128*64+128) ; Byte distance spanning 128 cache lines of 64 bytes (8KB) plus 128 more bytes (presumably the pad inserted every 8KB -- confirm)
; *************** xmm-pass2-8-levels-real macro ******************
; (defined below as hg_pass2_8_levels_real)
; This macro takes 256 real values, 256 semi-real values and the
; first 256 complex values and performs the final 8 levels of the
; FFT process, squares the results, and does the first 8 levels of
; the inverse FFT.
;
; "Squares the results" describes the ffttype = 2 case.  The macro
; dispatches on ffttype as follows:
;	ffttype = 4	skip forward FFT levels 1-6 and run the *_with_mulf macros
;	ffttype < 2	run the *_fft_final macros, then exit with no inverse FFT
;	ffttype = 2	run the *_with_square macros
;	ffttype > 2	(other than 4) run the *_with_mult macros
;
; Registers loaded by this macro:
;	rsi	DATA_ADDR, the source pointer each stage walks with "bump"
;	rcx	DATA_PREFETCH, the prefetch pointer given to loops_init_prefetch
;	rbx	DIST_TO_FFTSRCARG
;	rdi	sine/cosine pointer (sincos10, sincos9, sincos7, sincos6)
;	rbp	DIST_TO_MULSRCARG (ffttype 3 and ffttype 4 paths only)
; NOTE(review): the building-block macros invoked below are defined
; elsewhere and may use further registers -- confirm before relying on
; this list.
; NOTE(review): several of the per-stage "Do N ... macros" counts and
; "distance between fft data elements" remarks below do not obviously
; match the loop counts and strides actually coded; they look carried
; over from other FFT sizes -- verify before trusting them.
; The macro body is bracketed by start_timer 2 / end_timer 2.
hg_pass2_8_levels_real MACRO
LOCAL b0d, b1b, b2b, b3b, b4b, b5b, b6b, b7b, b8b, b9b
LOCAL baa, bab, bac, bad
LOCAL c0b, c1b, c2b, c3b, c4b, c5b, c6b, c7b, c8b
LOCAL xpass2_8_levels_real_1, xpass2_8_levels_real_2
LOCAL xpass2_8_levels_real_3, xpass2_8_levels_real_4
LOCAL xpass2_8_real_unfft, xpass2_8_real_done
;; We switch to the same format used in one-pass FFTs (that is, the
;; high word of xmm registers being 1 greater than the low word):
;; Load the TLBs for this data set and the next data set.
;; This data set is spread over 2 pages (8KB), the next data set also
;; takes 2 pages (8KB), and we touch one more page because the data
;; sets do not start exactly on 4KB boundaries.
start_timer 2
mov rsi, DATA_ADDR ;; Load source address
xtouch [rsi] ;; Read from this page (loads the TLB)
xtouch [rsi+4096] ;; Next page
xtouch [rsi+2*4096-128] ;; Next page
mov rcx, DATA_PREFETCH ;; Prefetch pointer
xtouch [rcx] ;; Read from this page (loads the TLB)
xtouch [rcx+4096] ;; Next page
xtouch [rcx+2*4096-128] ;; Next page
;; Type 4 FFTs skip the forward FFT process
;; (rbx is loaded before the branch so that it is set on every path)
mov rbx, DIST_TO_FFTSRCARG
cmp ffttype, 4
je xpass2_8_levels_real_4
;; Do FFT level 1
;;
;; On input the 64-byte cache lines hold these data values:
;; 0 +128 256 +128 512 +128 768 +128
;; 1 ...
;; ...
;; 127 ...
;; On output the 64-byte cache lines hold these data values:
;; 0 1 128 129 256 257 384 385
;; +512 ...
;; 2 ...
;; ...
;; 126 ...
;; +512 ...
;; Do eight_reals_fft_1 on real values 0 - 255
;; Do nothing on semi-real values from 256 - 511
;; Do two_complex_fft on complex values from 512 - 1023
;; distance between fft data elements is 128
loops_init 64 ;; Load loop counter
b1b: s2cl_eight_reals_fft_1 rsi, 2*64, 64
L2prefetch128 [rsi+3*128] ;; Prefetch a few iterations ahead
loops 64, b1b ;; Test loop counter
bump rsi, -64*2*64 ;; Restore source pointer
;; Do FFT levels 2,3
;; Values 0-127 are real data, 128-255 are semi-real data, 256-1023 are
;; complex data.
;;
;; On input the 64-byte cache lines hold these data values:
;; 0 1 128 129 256 257 384 385
;; +512 ...
;; 2 ...
;; ...
;; 126 ...
;; +512 ...
;; On output the 64-byte cache lines hold these data values:
;; 0 1 32 33 64 65 96 97
;; +512 ...
;; 2 ...
;; ...
;; 30 ...
;; +512 ...
;; 128 ...
;; ...
;; Do 16 eight_reals_fft_2 macros
;; Do 16 nop_two_two_complex_fft_2 macros
;; Do 32 four_complex_fft macros
;; distance between fft data elements is 32
;; do 16 macros each processing 32 data values
mov rdi, sincos10 ;; Load sin/cos pointer
loops_init 16
b2b: x4cl_eight_reals_fft_2 rsi, 2*64, 32*64, 64*64, rdi
loops 16, b2b ;; Test inner loop counter
;; Do 64 four_complex_fft macros
;; distance between fft data elements is 32
;; do 16 macros each processing 32 data values
bump rsi, -16*2*64+64 ;; Next source pointer
bump rdi, XMM_SCD ;; Next sine/cosine pointer
loops_init_prefetch 16, 128, 1, rcx
b3b: x4cl_four_complex_fft rsi, 2*64, 32*64, 64*64, rdi
loops 16, b3b ;; Test inner loop counter
bump rsi, -16*2*64-64 ;; Restore source pointer
;; Do FFT level 4
;; Values 0-31 are real data, 32-63 are semi-real data, 64-1023 are
;; complex data.
;;
;; On input the 64-byte cache lines hold these data values:
;; 0 1 32 33 64 65 96 97
;; +512 ...
;; 2 ...
;; ...
;; 30 ...
;; +512 ...
;; 128 ...
;; ...
;; On output the 64-byte cache lines hold these data values:
;; 0 1 16 17 32 33 48 49
;; +512 ...
;; 2 ...
;; ...
;; 14 ...
;; +512 ...
;; 64 ...
;; ...
;; Do 16 eight_reals_fft_1 macros
;; distance between fft data elements is 16
;; do 8 macros each processing 16 data values
mov rdi, sincos9 ;; Load sin/cos pointer
loops_init 64 ;; 64 = 8 real macros here + 56 complex macros at b5b
b4b: x2cl_eight_reals_fft_1 rsi, 2*64, 16*64
loops 8, b4b ;; Test inner loop counter
;; Do 112 two_complex_fft macros
;; distance between fft data elements is 64
;; do 56 macros each processing 16 data values
bump rsi, -8*2*64+32*64 ;; Load source pointer
bump rdi, XMM_SCD ;; Next sine/cosine pointer
loops_reset ;; 3 then 4 iterations of 8 (comment formerly said "of 4"; the inner loop below counts 8 -- TODO confirm)
b5b: x2cl_two_complex_fft rsi, 2*64, 16*64, rdi
loops 8, b5b ;; Test inner loop counter
bump rsi, -8*2*64+32*64 ;; Next source pointer
bump rdi, 2*XMM_SCD ;; Next sine/cosine pointer
loops 4, b5b ;; Test loop counter
bump rsi, -4*32*64+64 ;; Next source pointer
loops 2, b5b ;; Test outer loop counter
bump rsi, -2*64 ;; Restore source pointer
;; Do FFT levels 5,6
;; Values 0-15 are real data, 16-31 are semi-real data, 32-1023 are
;; complex data.
;;
;; On input the 64-byte cache lines hold these data values:
;; 0 1 16 17 32 33 48 49
;; +512 ...
;; 2 ...
;; ...
;; 14 ...
;; +512 ...
;; 64 ...
;; ...
;; On output the 64-byte cache lines hold these data values:
;; 0 1 4 5 8 9 12 13
;; +512 ...
;; 2 ...
;; +512 ...
;; 16 ...
;; ...
;; Do 2 eight_reals_fft_2 macros
;; Do 2 nop_two_two_complex_fft_2 macros
;; Do 4 four_complex_fft macros
;; distance between fft data elements is 4
;; do 2 macros each processing 32 data values
mov rdi, sincos7 ;; Load sin/cos pointer
loops_init 32 ;; 32 = 2 real macros here + 30 complex macros at b9b
b8b: x4cl_eight_reals_fft_2 rsi, 2*64, 4*64, 8*64, rdi
loops 2, b8b ;; Test loop counter
;; Do 120 four_complex_fft macros
;; distance between fft data elements is 4
;; do 30 macros each processing 32 data values
bump rsi, -2*2*64+16*64 ;; Load source pointer
bump rdi, XMM_SCD ;; Next sine/cosine pointer
loops_reset ;; 7 then 8 iterations of 2
b9b: x4cl_four_complex_fft rsi, 2*64, 4*64, 8*64, rdi
loops 2, b9b ;; Test loop counter
bump rsi, -2*2*64+16*64 ;; Next source pointer
bump rdi, 2*XMM_SCD ;; Next sine/cosine pointer
loops 8, b9b ;; Test middle loop counter
bump rsi, -8*16*64+64 ;; Next source pointer
loops 2, b9b ;; Test outer loop counter
bump rsi, -2*64 ;; Restore source pointer
;; Do FFT levels 7,8
;; Values 0-3 are real data, 4-7 are semi-real data, 8-1023 are
;; complex data.
;;
;; On input the 64-byte cache lines hold these data values:
;; 0 1 4 5 8 9 12 13
;; +512 ...
;; 2 ...
;; +512 ...
;; 16 ...
;; ...
;; On output the 64-byte cache lines hold these data values:
;; 0 1 2 3 4 5 6 7
;; +512 ...
;; 8 ...
;; ...
;; Do 1/2 eight_reals_fft_2 macros
;; Do 1/2 nop_two_two_complex_fft_2 macros
;; Do 1 four_complex_fft macros
;; distance between fft data elements is 1
;; Do 126 more four_complex_fft macros
;; distance between fft data elements is 1
;; do 63 macros each processing 16 data values
mov rdi, sincos6 ;; Load sin/cos pointer
;; Execute the proper middle step
;; ffttype < 2 goes to _1, ffttype > 2 goes to _3, ffttype = 2 falls
;; through to _2.  (ffttype = 4 never reaches this compare; it branched
;; to _4 at the top of the macro.)
cmp ffttype, 2
jl xpass2_8_levels_real_1
jg xpass2_8_levels_real_3
;; ffttype = 2: middle step using the *_with_square macros
xpass2_8_levels_real_2:
s2cl_eight_reals_with_square_2 rsi, 4*64, 2*64
bump rdi, XMM_SCD ;; Next sine/cosine pointer
loops_init 64, 1, 32 ;; 31 then 32 iterations
baa: s2cl_four_complex_with_square rsi, 4*64, 2*64
bump rdi, XMM_SCD ;; Next sine/cosine pointer
loops 32, baa ;; Test loop counter
bump rsi, -32*4*64+64 ;; Next source pointer
loops 2, baa ;; Test outer loop counter
bump rsi, -2*64 ;; Restore source pointer
jmp xpass2_8_real_unfft
;; ffttype < 2: middle step using the *_fft_final macros.  This path
;; jumps directly to xpass2_8_real_done, so no inverse FFT is performed
;; and the source pointer is not restored.
xpass2_8_levels_real_1:
s2cl_eight_reals_fft_2_final rsi, 4*64, 2*64
bump rdi, XMM_SCD ;; Next sine/cosine pointer
loops_init 64, 1, 32 ;; 31 then 32 iterations
bab: s2cl_four_complex_fft_final rsi, 4*64, 2*64
bump rdi, XMM_SCD ;; Next sine/cosine pointer
loops 32, bab ;; Test loop counter
bump rsi, -32*4*64+64 ;; Next source pointer
loops 2, bab ;; Test outer loop counter
jmp xpass2_8_real_done
;; ffttype > 2 (other than 4): middle step using the *_with_mult macros.
;; rbp is loaded with DIST_TO_MULSRCARG first (presumably consumed by
;; those macros -- confirm against their definitions).
xpass2_8_levels_real_3:
mov rbp, DIST_TO_MULSRCARG
s2cl_eight_reals_with_mult_2 rsi, 4*64, 2*64
bump rdi, XMM_SCD ;; Next sine/cosine pointer
loops_init 64, 1, 32 ;; 31 then 32 iterations
bac: s2cl_four_complex_with_mult rsi, 4*64, 2*64
bump rdi, XMM_SCD ;; Next sine/cosine pointer
loops 32, bac ;; Test loop counter
bump rsi, -32*4*64+64 ;; Next source pointer
loops 2, bac ;; Test outer loop counter
bump rsi, -2*64 ;; Restore source pointer
jmp xpass2_8_real_unfft
;; ffttype = 4: entered directly from the top of the macro, so the
;; sin/cos pointer must be loaded here.  Uses the *_with_mulf macros and
;; then falls through into the inverse FFT below.
xpass2_8_levels_real_4:
mov rbp, DIST_TO_MULSRCARG
mov rdi, sincos6 ;; Load sin/cos pointer
s2cl_eight_reals_with_mulf_2 rsi, 4*64, 2*64
bump rdi, XMM_SCD ;; Next sine/cosine pointer
loops_init 64, 1, 32 ;; 31 then 32 iterations
bad: s2cl_four_complex_with_mulf rsi, 4*64, 2*64
bump rdi, XMM_SCD ;; Next sine/cosine pointer
loops 32, bad ;; Test loop counter
bump rsi, -32*4*64+64 ;; Next source pointer
loops 2, bad ;; Test outer loop counter
bump rsi, -2*64 ;; Restore source pointer
;; Do inverse FFT levels 5,6
;; On input the 64-byte cache lines hold these data values:
;; 0 1 2 3 4 5 6 7
;; +512 ...
;; 8 ...
;; ...
;; On output the 64-byte cache lines hold these data values:
;; 0 1 8 9 16 17 24 25
;; +512 ...
;; 2 ...
;; ...
;; 6 ...
;; +512 ...
;; 32 ...
;; ...
;; Do 2 eight_reals_unfft_2 macros
;; Do 2 nop_two_two_complex_unfft_2 macros
;; distance between fft data elements is 4
;; do 1 macro processing 32 data values
xpass2_8_real_unfft:
x4cl_eight_reals_unfft_2 rsi, 8*64, 2*64, 4*64
;; Do 124 four_complex_unfft macros
;; distance between fft data elements is 4
;; do 31 macros each processing 32 data values
mov rdi, sincos7 ;; Load sin/cos pointer
loops_init 32, 1, 16 ;; 15 then 16 iterations
c8b: x4cl_four_complex_unfft rsi, 8*64, 2*64, 4*64, rdi
bump rdi, XMM_SCD ;; Next sine/cosine pointer
loops 16, c8b ;; Test loop counter
bump rsi, -16*8*64+64 ;; Next source pointer
loops 2, c8b ;; Test loop counter
bump rsi, -2*64 ;; Restore source pointer
;; Do inverse FFT level 4
;; On input the 64-byte cache lines hold these data values:
;; 0 1 8 9 16 17 24 25
;; +512 ...
;; 2 ...
;; ...
;; 6 ...
;; +512 ...
;; 32 ...
;; ...
;; On output the 64-byte cache lines hold these data values:
;; 0 1 16 17 32 33 48 49
;; +512 ...
;; 2 ...
;; ...
;; 14 ...
;; +512 ...
;; 64 ...
;; ...
;; Do 8 eight_reals_unfft_1 macros
;; distance between fft data elements is 16
;; do 4 macros each processing 16 data values
loops_init 64 ;; 64 = 4 real macros here + 60 complex macros at c5b
c4b: x2cl_eight_reals_unfft_1 rsi, 2*64, 8*64
loops 4, c4b ;; Test inner loop counter
;; Do 960 two_complex_unfft macros
;; distance between fft data elements is 16
;; do 240 macros each processing 32 data values
bump rsi, -4*2*64+16*64 ;; Load source pointer
mov rdi, sincos9 ;; Load sin/cos pointer
loops_reset ;; 7 then 8 iterations of 4
c5b: x2cl_two_complex_unfft rsi, 2*64, 8*64
loops 4, c5b ;; Test inner loop counter
bump rsi, -4*2*64+16*64 ;; Next source pointer
bump rdi, XMM_SCD ;; Next sine/cosine pointer
loops 8, c5b ;; Test outer loop counter
bump rsi, -8*16*64+64 ;; Next source pointer
loops 2, c5b ;; Test loop counter
bump rsi, -2*64 ;; Restore source pointer
;; Do inverse FFT levels 2,3
;; On input the 64-byte cache lines hold these data values:
;; 0 1 16 17 32 33 48 49
;; +512 ...
;; 2 ...
;; ...
;; 14 ...
;; +512 ...
;; 64 ...
;; ...
;; On output the 64-byte cache lines hold these data values:
;; 0 1 64 65 128 129 192 193
;; +512 ...
;; 2 ...
;; ...
;; 62 ...
;; +512 ...
;; 256 ...
;; ...
;; Do 128 eight_reals_unfft_2 macros
;; Do 128 nop_two_two_complex_unfft_2 macros
;; distance between fft data elements is 256
;; do 8 macros each processing 32 data values
loops_init 32 ;; 32 = 8 real macros here + 24 complex macros at c3b
c2b: x4cl_eight_reals_unfft_2 rsi, 2*64, 16*64, 32*64
loops 8, c2b ;; Test inner loop counter
;; Do 96 four_complex_unfft macros
;; distance between fft data elements is 32
;; do 24 macros each processing 32 data values
bump rsi, -8*2*64+64*64 ;; Next source pointer
mov rdi, sincos10 ;; Load sin/cos pointer
loops_reset ;; 1 then 2 iterations of 8 (comment formerly said "of 64"; the inner loop below counts 8 -- TODO confirm)
c3b: x4cl_four_complex_unfft rsi, 2*64, 16*64, 32*64, rdi
loops 8, c3b ;; Test inner loop counter
bump rsi, -8*2*64+64*64 ;; Next source pointer
bump rdi, XMM_SCD ;; Next sine/cosine pointer
loops 2, c3b ;; Test loop counter
bump rsi, -2*64*64+64 ;; Next source pointer
loops 2, c3b ;; Test loop counter
bump rsi, -2*64 ;; Restore source pointer
;; Do inverse FFT level 1
;; On input the 64-byte cache lines hold these data values:
;; 0 1 64 65 128 129 192 193
;; +512 ...
;; 2 ...
;; ...
;; 62 ...
;; +512 ...
;; 256 ...
;; ...
;; On output the 64-byte cache lines hold these data values:
;; 0 +128 256 +128 512 +128 768 +128
;; 1 ...
;; ...
;; 127 ...
;; Do 64 eight_reals_last_unfft macros
;; Do 64 two_two_complex_unfft macros
;; distance between fft data elements is 128
;; do 32 macros each processing 16 data values
;; (rsi is not restored after this final stage)
loops_init 32
c1b: s4cl_eight_reals_unfft_1 rsi, 2*64, 64*64, 64
loops 32, c1b ;; Test loop counter
;; Common exit: also the target of the ffttype < 2 path, which skips
;; the inverse FFT above.
xpass2_8_real_done:
end_timer 2
ENDM
; *************** xmm-pass2-8-levels-complex macro ******************
; (defined below as hg_pass2_8_levels_complex)
; This macro takes 2 groups of 256 complex values, premultiplies them and
; performs the final 8 levels of the FFT process, squares the
; results, and does the first 8 levels of the inverse FFT.
; NOTE: Rather than remembering 256 multipliers we break them up
; into 64 groups of 4 columns. By remembering these 68 values we can compute
; all 256 multipliers using a single complex multiply. These extra
; complex multiplies can probably be done faster than the computer can read the
; extra multipliers from memory (and are more likely to stay in the L2 cache).
;
; "Squares the results" describes the ffttype = 2 case.  The macro
; dispatches on ffttype as follows:
;	ffttype = 4	skip forward FFT levels 1-6 and run x4cl_four_complex_with_mulf
;	ffttype < 2	run x4cl_four_complex_fft_final, then exit with no inverse FFT
;	ffttype = 2	run x4cl_four_complex_with_square
;	ffttype > 2	(other than 4) run x4cl_four_complex_with_mult
;
; Registers loaded by this macro:
;	rsi	DATA_ADDR, the FFT data pointer each stage walks with "bump"
;	rcx	prefetch pointer, alternately DATA_PREFETCH and PREMULT_PREFETCH
;	rbx	DIST_TO_FFTSRCARG
;	rdi	PREMULT_ADDR (premultiplier stages) or xsincos_complex (sin/cos stages)
;	rbp	DIST_TO_MULSRCARG (ffttype 3 and ffttype 4 paths only)
; NOTE(review): the building-block macros invoked below are defined
; elsewhere and may use further registers -- confirm before relying on
; this list.
; Each stage is individually timed: timers 3, 4, 5, 8, 9, 10, 13 and 14.
hg_pass2_8_levels_complex MACRO
LOCAL b1b, b2b, b5b
LOCAL b6a0, b6a, b6b0, b6b, b6c0, b6c, b6d0, b6d
LOCAL c1b, c2b, c5b
LOCAL xpass2_8_levels_complex_1, xpass2_8_levels_complex_2
LOCAL xpass2_8_levels_complex_3, xpass2_8_levels_complex_4
LOCAL xpass2_8_levels_complex_unfft, xpass2_8_levels_complex_done
;; Load the TLBs for this data set and the next data set.
;; This data set is spread over 2 pages (8KB), the next data set also
;; takes 2 pages (8KB), and we touch one more page because the data
;; sets do not start exactly on 4KB boundaries. Also, preload TLBs for
;; premultiplier data.
start_timer 3
mov rcx, DATA_PREFETCH ;; Prefetch pointer
xtouch [rcx] ;; Read from this page (loads the TLB)
xtouch [rcx+4096] ;; Next page
xtouch [rcx+2*4096-128] ;; Next page
mov rcx, PREMULT_PREFETCH ;; Prefetch pointer
xtouch [rcx] ;; Load premultiplier TLBs
xtouch [rcx+4096-128] ;; Load premultiplier TLBs
end_timer 3
;; Type 4 FFTs skip the forward FFT process
;; (rsi and rbx are loaded before the branch so they are set on every path)
mov rsi, DATA_ADDR ;; Load address of FFT data
mov rbx, DIST_TO_FFTSRCARG
cmp ffttype, 4
je xpass2_8_levels_complex_4
;; Do FFT levels 1,2
;;
;; We swizzle the data so that one set of data is in the high word of
;; an xmm register, the other set is in the low word. This means
;; a distance of 4096 data values.
;;
;; On input the 64-byte cache lines hold these data values:
;; 0 +128 256 +128 512 +128 768 +128
;; 1 ...
;; ...
;; 127
;; On output the 64-byte cache lines hold these data values:
;; 0 +512 64 +512 128 +512 192 +512
;; 1 ...
;; ...
;; 63 ...
;; 256 ...
;; ...
;; Do 128 four_complex_gpm_fft macros
;; distance between fft data elements is 64
;; do 64 macros each processing 16 data values
start_timer 4
mov rdi, PREMULT_ADDR ;; The group multipliers
loops_init 64 ;; 16 iterations of 4
b1b: s2cl_four_complex_gpm_fft rsi, 64, 64*64
L2prefetch128 [rsi+5*64] ;; Prefetch a few iterations ahead in
L2prefetch128 [rsi+64*64+5*64] ;; case they are not in the L2 cache
loops 4, b1b ;; Test inner loop counter
bump rdi, XMM_PMD ;; Next group multiplier
L2prefetch128 [rdi+4*XMM_PMD] ;; Prefetch upcoming premultiplier data
L2prefetch128 [rdi+16*XMM_PMD]
loops 16, b1b ;; Test outer loop counter
bump rsi, -64*64 ;; Restore source pointer
end_timer 4
;; Do FFT levels 3,4
;;
;; On input the 64-byte cache lines hold these data values:
;; 0 +512 64 +512 128 +512 192 +512
;; 1 ...
;; ...
;; 63 ...
;; 256 ...
;; ...
;; On output the 64-byte cache lines hold these data values:
;; 0 +512 16 +512 32 +512 48 +512
;; 1 ...
;; ...
;; 15 ...
;; 64 ...
;; ...
;; Do 4 groups of four_complex_fft_cpm macros
;; To make things run faster we precompute the column multipliers times the
;; 4 different sine/cosine multipliers we run into in this section. This
;; costs some memory but saves us some complex multiplies.
;; Do 128 four_complex_cpm_fft macros
;; distance between fft data elements is 16
;; do 16 macros each processing 64 data values
;; NOTE(review): rdi is not reloaded here; it continues from where the
;; levels 1,2 loop left it (PREMULT_ADDR advanced by 16 bumps of XMM_PMD).
start_timer 5
mov rcx, PREMULT_PREFETCH ;; Prefetch pointer
loops_init_prefetch 32, 128, 1, rcx ;; 2 iters of 4 iterations of 4
b2b: x4cl_four_complex_cpm_fft rsi, 64, 16*64, 32*64, 4*XMM_PMD
bump rdi, XMM_PMD ;; Next column premultiplier
loops 4, b2b ;; Test inner loop counter
bump rdi, -4*XMM_PMD ;; Restore column premultiplier
loops 4, b2b ;; Test loop counter
bump rsi, -16*64+64*64 ;; Next source pointer
bump rdi, 8*XMM_PMD ;; Next set of premultipliers
loops 2, b2b ;; Test outer loop counter
bump rsi, -2*64*64 ;; Restore source pointer
end_timer 5
;; Do FFT levels 5,6
;;
;; On input the 64-byte cache lines hold these data values:
;; 0 +512 16 +512 32 +512 48 +512
;; 1 ...
;; ...
;; 15 ...
;; 64 ...
;; ...
;; On output the 64-byte cache lines hold these data values:
;; 0 +512 4 +512 8 +512 12 +512
;; 1 ...
;; ...
;; 3 ...
;; 16 ...
;; ...
;; Do 128 four_complex_fft macros
;; distance between fft data elements is 4
;; do 32 macros each processing 32 data values
start_timer 8
mov rcx, DATA_PREFETCH ;; Prefetch pointer
mov rdi, xsincos_complex ;; Load sin/cos pointer
loops_init_prefetch 32, 64, 1, rcx ;; 8 iterations of 4, prefetch 64 bytes every macro
b5b: x4cl_four_complex_fft rsi, 64, 4*64, 8*64, rdi
loops 4, b5b ;; Loop 4 times
bump rsi, -4*64+16*64 ;; Next source pointer
bump rdi, 2*XMM_SCD ;; Next sine/cosine pointer
loops 8, b5b ;; Test outer loop counter
bump rsi, -8*16*64 ;; Restore source pointer
end_timer 8
;; Do FFT levels 7,8 as well as inverse FFT levels 7,8
;;
;; On input the 64-byte cache lines hold these data values:
;; 0 +512 4 +512 8 +512 12 +512
;; 1 ...
;; ...
;; 3 ...
;; 16 ...
;; ...
;; On output the 64-byte cache lines hold these data values:
;; 0 +512 2 +512 4 +512 6 +512
;; 1 ...
;; 8 ...
;; ...
;; Do 128 four_complex_fft macros
;; distance between fft data elements is 1
;; do 32 macros each processing 32 data values
;; Execute the right middle step
;; ffttype < 2 goes to _1, ffttype > 2 goes to _3, ffttype = 2 falls
;; through to _2.  (ffttype = 4 never reaches this compare; it branched
;; to _4 at the top of the macro.)
;; In each variant the inner loop runs 8 times and the outer loop (the
;; b6x0 label) reloads rdi with xsincos_complex 4 times.
cmp ffttype, 2
jl xpass2_8_levels_complex_1
jg xpass2_8_levels_complex_3
;; ffttype = 2: middle step using x4cl_four_complex_with_square
xpass2_8_levels_complex_2:
start_timer 9
loops_init_prefetch 32, 64, 1, rcx
b6a0: mov rdi, xsincos_complex ;; Load sin/cos pointer
b6a: x4cl_four_complex_with_square rsi, 4*64, 64, 2*64
bump rdi, 2*XMM_SCD ;; Next sine/cosine pointer
loops 8, b6a ;; Loop 8 times
loops 4, b6a0 ;; Test loop counter
end_timer 9
bump rsi, -32*4*64 ;; Restore source pointer
jmp xpass2_8_levels_complex_unfft
;; ffttype < 2: middle step using x4cl_four_complex_fft_final.  This
;; path jumps directly to xpass2_8_levels_complex_done, so no inverse
;; FFT is performed and the source pointer is not restored.
xpass2_8_levels_complex_1:
start_timer 9
loops_init_prefetch 32, 128, 1, rcx
b6b0: mov rdi, xsincos_complex ;; Load sin/cos pointer
b6b: x4cl_four_complex_fft_final rsi, 4*64, 64, 2*64
bump rdi, 2*XMM_SCD ;; Next sine/cosine pointer
loops 8, b6b ;; Test loop counter
loops 4, b6b0 ;; Test loop counter
end_timer 9
jmp xpass2_8_levels_complex_done
;; ffttype > 2 (other than 4): middle step using
;; x4cl_four_complex_with_mult.  rbp is loaded with DIST_TO_MULSRCARG
;; first (presumably consumed by that macro -- confirm).
xpass2_8_levels_complex_3:
start_timer 9
mov rbp, DIST_TO_MULSRCARG
loops_init_prefetch 32, 64, 1, rcx
b6c0: mov rdi, xsincos_complex ;; Load sin/cos pointer
b6c: x4cl_four_complex_with_mult rsi, 4*64, 64, 2*64
bump rdi, 2*XMM_SCD ;; Next sine/cosine pointer
loops 8, b6c ;; Loop 8 times
loops 4, b6c0 ;; Test loop counter
end_timer 9
bump rsi, -32*4*64 ;; Restore source pointer
jmp xpass2_8_levels_complex_unfft
;; ffttype = 4: entered directly from the top of the macro.  rcx still
;; holds PREMULT_PREFETCH from the TLB preload, so it is pointed at
;; DATA_PREFETCH here.  Uses x4cl_four_complex_with_mulf and then falls
;; through into the inverse FFT below.
xpass2_8_levels_complex_4:
start_timer 9
mov rbp, DIST_TO_MULSRCARG
mov rcx, DATA_PREFETCH ;; Prefetch pointer
loops_init_prefetch 32, 128, 1, rcx
b6d0: mov rdi, xsincos_complex ;; Load sin/cos pointer
b6d: x4cl_four_complex_with_mulf rsi, 4*64, 64, 2*64
bump rdi, 2*XMM_SCD ;; Next sine/cosine pointer
loops 8, b6d ;; Test loop counter
loops 4, b6d0 ;; Test loop counter
end_timer 9
bump rsi, -32*4*64 ;; Restore source pointer
;; Do inverse FFT levels 5,6
;; On input the 64-byte cache lines hold these data values:
;; 0 +512 2 +512 4 +512 6 +512
;; 1 ...
;; 8 ...
;; ...
;; On output the 64-byte cache lines hold these data values:
;; 0 +512 8 +512 16 +512 24 +512
;; 1 ...
;; ...
;; 7 ...
;; 32 ...
;; ...
;; Do 128 four_complex_unfft macros
;; distance between fft data elements is 4
;; do 32 macros each processing 32 data values
xpass2_8_levels_complex_unfft:
start_timer 10
mov rdi, xsincos_complex ;; Load sin/cos pointer
loops_init_prefetch 32, 64, 1, rcx
c5b: x4cl_four_complex_unfft rsi, 64, 2*64, 4*64, rdi
loops 2, c5b ;; Test inner loop counter
bump rsi, -2*64+8*64 ;; Next source pointer
bump rdi, XMM_SCD ;; Next sine/cosine pointer
loops 16, c5b ;; Test loop counter
bump rsi, -16*8*64 ;; Restore source pointer
end_timer 10
;; Do inverse FFT levels 3,4
;; On input the 64-byte cache lines hold these data values:
;; 0 +512 8 +512 16 +512 24 +512
;; 1 ...
;; ...
;; 7 ...
;; 32 ...
;; ...
;; On output the 64-byte cache lines hold these data values:
;; 0 +512 32 +512 64 +512 96 +512
;; 1 ...
;; ...
;; 31 ...
;; 128 ...
;; ...
;; Do 4 groups of 32 four_complex_fft macros (FFT levels 3,4)
;; In this section we also apply the 4 column multipliers (actually three
;; since the first column multiplier is always one). To make things
;; run faster we precompute the column multipliers times the 4 different
;; sine/cosine multipliers we run into in this section. This costs some
;; memory but saves us some complex multiplies.
;; Do 128 four_complex_cpm_unfft macros
;; distance between fft data elements is 16
;; do 8 macros each processing 128 data values
start_timer 13
mov rdi, PREMULT_ADDR ;; Load column multipliers pointer
bump rdi, 16*XMM_PMD ;; Skip past the first 16 XMM_PMD entries (the ones the forward levels 1,2 loop stepped through as group multipliers)
loops_init_prefetch 32, 64, 1, rcx ;; 4 iters of 2 iterations of 4
c2b: x4cl_four_complex_cpm_unfft rsi, 64, 8*64, 16*64
bump rdi, XMM_PMD ;; Next column premultiplier
loops 4, c2b ;; Loop 4 times
bump rdi, -4*XMM_PMD ;; Reset premultiplier pointer
loops 2, c2b ;; Reset counters, test loop counter
bump rsi, -8*64+32*64 ;; Next source pointer
bump rdi, 4*XMM_PMD ;; Next premultiplier pointer
loops 4, c2b ;; Test outer loop counter
bump rsi, -4*32*64 ;; Restore source pointer
end_timer 13
;; Do inverse FFT levels 1,2
;; On input the 64-byte cache lines hold these data values:
;; 0 +512 32 +512 64 +512 96 +512
;; 1 ...
;; ...
;; 31 ...
;; 128 ...
;; ...
;; On output the 64-byte cache lines hold these data values:
;; 0 +128 256 +128 512 +128 768 +128
;; 1 ...
;; ...
;; 255
;; NOTE(review): the matching forward-FFT input layout above ends at
;; row 127, not 255 -- the "255" here looks like a typo; confirm.
;; Do 32 four_complex_gpm_unfft macros
;; distance between fft data elements is 64
;; do 32 macros each processing 32 data values
;; (rsi is not restored after this final stage)
start_timer 14
mov rcx, PREMULT_PREFETCH ;; Prefetch pointer
mov rdi, PREMULT_ADDR ;; Load group multiplier pointer
loops_init_prefetch 32, 64, 1, rcx ;; 8 iterations of 4
c1b: s4cl_four_complex_gpm_unfft rsi, 64, 32*64, 64*64, 8*XMM_PMD
loops 4, c1b ;; Test inner loop counter
bump rdi, XMM_PMD ;; Next set of multipliers
loops 8, c1b ;; Test outer loop counter
end_timer 14
;; Common exit: also the target of the ffttype < 2 path, which skips
;; the inverse FFT above.
xpass2_8_levels_complex_done:
ENDM
; *************** xmm-pass2-10-levels-real macro ******************
; This macro takes 1024 real values, 1024 semi-real values and the
; first 1024 complex values and performs the final 10 levels of the
; FFT process, squares the results, and does the first 10 levels of
; the inverse FFT.
hg_pass2_10_levels_real MACRO
LOCAL b0d, b1b, b2b, b3b, b4b, b5b, b6b, b7b, b8b, b9b
LOCAL baa0, baa, bab0, bab, bac0, bac, bad0, bad
LOCAL c0b, c1b, c2b, c3b, c4b, c5b, c6b, c7b, c8b
LOCAL xpass2_10_levels_real_1, xpass2_10_levels_real_2
LOCAL xpass2_10_levels_real_3, xpass2_10_levels_real_4
LOCAL xpass2_10_real_unfft, xpass2_10_real_done
;; We switch to the same format used in one-pass FFTs (that is, the
;; high word of xmm registers being 1 greater than the low word):
;; Type 4 FFTs skip the forward FFT process
start_timer 2
mov rsi, DATA_ADDR ;; Load source address
mov rbx, DIST_TO_FFTSRCARG
cmp ffttype, 4
je xpass2_10_levels_real_4
;; Do FFT level 1
;;
;; On input the 64-byte cache lines hold these data values:
;; 0 +512 1024 +512 2048 +512 3072 +512
;; 1 ...
;; ...
;; 511 ...
;; On output the 64-byte cache lines hold these data values:
;; 0 +1 512 +1 1024 +1 1536 +1
;; +2048 ...
;; 2 ...
;; ...
;; 510 ...
;; +2048 ...
;; Do eight_reals_fft_1 on real values 0 - 1023
;; Do nothing on semi-real values from 1024 - 2047
;; Do two_complex_fft on complex values from 2048 - 4095
;; distance between fft data elements is 512
;; do 256 iterations
loops_init 256
b1b: s2cl_eight_reals_fft_1 rsi, 2*64, 64
L2prefetch128 [rsi+3*128] ;; Prefetch a few iterations ahead
loops 64, b1b ;; Test loop counter
bump rsi, -64*2*64+dist128 ;; Next source pointer
loops 4, b1b ;; Test loop counter
bump rsi, -4*dist128 ;; Restore source pointer
;; Do FFT levels 2,3
;; Values 0-511 is real data, 512-1023 is semi-real data, 1024-4095 is
;; complex data.
;;
;; On input the 64-byte cache lines hold these data values:
;; 0 +1 512 +1 1024 +1 1536 +1
;; +2048 ...
;; 2 ...
;; ...
;; 510 ...
;; +2048 ...
;; On output the 64-byte cache lines hold these data values:
;; 0 +1 128 +1 256 +1 384 +1
;; +2048 ...
;; 2 ...
;; ...
;; 126 ...
;; +2048 ...
;; 512 ...
;; ...
;; Do 64 eight_reals_fft_2 macros
;; Do 64 nop_two_two_complex_fft_2 macros
;; Do 128 four_complex_fft macros
;; distance between fft data elements is 128
;; do 64 macros each processing 32 data values
mov rdi, sincos10 ;; Load sin/cos pointer
mov rcx, DATA_PREFETCH ;; Prefetch pointer
loops_init 64
b2b: x4cl_eight_reals_fft_2 rsi, 2*64, dist128, 2*dist128, rdi
loops 64, b2b ;; Test loop counter
;; Do 256 four_complex_fft macros
;; distance between fft data elements is 128
;; do 64 macros each processing 32 data values
bump rsi, -64*2*64+64 ;; Next source pointer
bump rdi, XMM_SCD ;; Next sine/cosine pointer
loops_init_prefetch 64, 128, 1, rcx
xtouch [rcx] ;; Preload first TLB
xtouch [rcx+4096-128] ;; Preload the TLBs
xtouch [rcx+2*4096-128]
b3b: x4cl_four_complex_fft rsi, 2*64, dist128, 2*dist128, rdi
loops 64, b3b ;; Test loop counter
bump rsi, -64*2*64-64 ;; Restore source pointer
bump rcx, 128 ;; Skip pad bytes every 8KB
;; Do FFT levels 4
;; Values 0-127 is real data, 128-255 is semi-real data, 256-4095 is
;; complex data.
;;
;; On input the 64-byte cache lines hold these data values:
;; 0 +1 128 +1 256 +1 384 +1
;; +2048 ...
;; 2 ...
;; ...
;; 126 ...
;; +2048 ...
;; 512 ...
;; ...
;; On output the 64-byte cache lines hold these data values:
;; 0 +1 64 +1 128 +1 192 +1
;; +2048 ...
;; 2 ...
;; ...
;; 62 ...
;; +2048 ...
;; 256 ...
;; ...
;; Do 64 eight_reals_fft_1 macros
;; distance between fft data elements is 64
;; do 32 macros each processing 16 data values
mov rdi, sincos9 ;; Load sin/cos pointer
loops_init 256
b4b: x2cl_eight_reals_fft_1 rsi, 2*64, 64*64
loops 32, b4b ;; Test loop counter
;; Do 448 two_complex_fft macros
;; distance between fft data elements is 64
;; do 224 macros each processing 16 data values
bump rsi, -32*2*64+dist128 ;; Load source pointer
bump rdi, XMM_SCD ;; Next sine/cosine pointer
loops_reset ;; 3 then 4 iterations of 32
b5b: x2cl_two_complex_fft rsi, 2*64, 64*64, rdi
loops 32, b5b ;; Test loop counter
bump rsi, -32*2*64+dist128 ;; Next source pointer
bump rdi, 2*XMM_SCD ;; Next sine/cosine pointer
loops 4, b5b ;; Test loop counter
bump rsi, -4*dist128+64 ;; Next source pointer
loops 2, b5b ;; Test outer loop counter
bump rsi, -2*64 ;; Restore source pointer
;; Do FFT levels 5,6
;; Values 0-63 is real data, 64-127 is semi-real data, 128-4095 is
;; complex data.
;;
;; On input the 64-byte cache lines hold these data values:
;; 0 +1 64 +1 128 +1 192 +1
;; +2048 ...
;; 2 ...
;; ...
;; 62 ...
;; +2048 ...
;; 256 ...
;; ...
;; On output the 64-byte cache lines hold these data values:
;; 0 +1 16 +1 32 +1 48 +1
;; +2048 ...
;; 2 ...
;; ...
;; 14 ...
;; +2048 ...
;; 64 ...
;; ...
;; Do 8 eight_reals_fft_2 macros
;; Do 8 nop_two_two_complex_fft_2 macros
;; Do 16 four_complex_fft macros
;; distance between fft data elements is 16
;; do 8 macros each processing 32 data values
mov rdi, sincos8 ;; Load sin/cos pointer
loops_init 128
b6b: x4cl_eight_reals_fft_2 rsi, 2*64, 16*64, 32*64, rdi
loops 8, b6b ;; Test loop counter
;; Do 480 four_complex_fft macros
;; distance between fft data elements is 16
;; do 120 macros each processing 32 data values
bump rsi, -8*2*64+64*64 ;; Load source pointer
bump rdi, XMM_SCD ;; Next sine/cosine pointer
loops_reset ;; 2 iters of 4 iters of 1 or 2 of 8
b7b: x4cl_four_complex_fft rsi, 2*64, 16*64, 32*64, rdi
loops 8, b7b ;; Test inner loop counter
bump rsi, -8*2*64+64*64 ;; Next source pointer
bump rdi, 2*XMM_SCD ;; Next sine/cosine pointer
loops 2, b7b ;; Test middle loop counter
bump rsi, -2*64*64+dist128 ;; Next source pointer
loops 4, b7b ;; 4 iters
bump rsi, -4*dist128+64 ;; Next source pointer
loops 2, b7b ;; Test outer loop counter
bump rsi, -2*64 ;; Restore source pointer
;; Do FFT levels 7,8