-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathopa_issue.vhd
1006 lines (919 loc) · 50 KB
/
opa_issue.vhd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
-- opa: Open Processor Architecture
-- Copyright (C) 2014-2016 Wesley W. Terpstra
--
-- This program is free software: you can redistribute it and/or modify
-- it under the terms of the GNU General Public License as published by
-- the Free Software Foundation, either version 3 of the License, or
-- (at your option) any later version.
--
-- This program is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-- GNU General Public License for more details.
--
-- You should have received a copy of the GNU General Public License
-- along with this program. If not, see <http://www.gnu.org/licenses/>.
--
-- To apply the GPL to my VHDL, please follow these definitions:
-- Program - The entire collection of VHDL in this project and any
-- netlist or floorplan derived from it.
-- System Library - Any macro that translates directly to hardware
-- e.g. registers, IO pins, or memory blocks
--
-- My intent is that if you include OPA into your project, all of the HDL
-- and other design files that go into the same physical chip must also
-- be released under the GPL. If this does not cover your usage, then you
-- must consult me directly to receive the code under a different license.
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
library work;
use work.opa_pkg.all;
use work.opa_isa_base_pkg.all;
use work.opa_functions_pkg.all;
use work.opa_components_pkg.all;
-- opa_issue: interface of the OPA issue stage.
-- The port groups below connect this unit to the renamer, the execution
-- units (EUs), the register file and the L1 data cache.
-- Vector widths are all derived from the g_isa / g_config generics through
-- the f_opa_* helper functions of the OPA packages:
--   f_opa_renamers(g_config)  : width of every per-renamer port
--   f_opa_executers(g_config) : width of every per-EU port
--   f_opa_num_slow(g_config)  : width of every per-slow-unit L1d port
entity opa_issue is
generic(
g_isa : t_opa_isa;
g_config : t_opa_config;
g_target : t_opa_target);
port(
clk_i : in std_logic;
-- NOTE(review): the "_n" suffix suggests an active-low reset; confirm in the architecture.
rst_n_i : in std_logic;
-- Values the renamer provides us
-- rename_stb_i / rename_stall_o form the handshake with the renamer.
-- The std_logic_vector ports carry one bit per renamed instruction; the
-- matrix ports carry one row per renamed instruction.
rename_stb_i : in std_logic;
rename_stall_o : out std_logic;
rename_fast_i : in std_logic_vector(f_opa_renamers(g_config)-1 downto 0);
rename_slow_i : in std_logic_vector(f_opa_renamers(g_config)-1 downto 0);
rename_order_i : in std_logic_vector(f_opa_renamers(g_config)-1 downto 0);
rename_geta_i : in std_logic_vector(f_opa_renamers(g_config)-1 downto 0);
rename_getb_i : in std_logic_vector(f_opa_renamers(g_config)-1 downto 0);
-- Note: aux is a single vector here, not a per-renamer matrix.
rename_aux_i : in std_logic_vector(f_opa_aux_wide(g_config)-1 downto 0);
rename_bakx_i : in t_opa_matrix(f_opa_renamers(g_config)-1 downto 0, f_opa_back_wide(g_isa,g_config)-1 downto 0);
rename_baka_i : in t_opa_matrix(f_opa_renamers(g_config)-1 downto 0, f_opa_back_wide(g_isa,g_config)-1 downto 0);
rename_bakb_i : in t_opa_matrix(f_opa_renamers(g_config)-1 downto 0, f_opa_back_wide(g_isa,g_config)-1 downto 0);
rename_stata_i : in t_opa_matrix(f_opa_renamers(g_config)-1 downto 0, f_opa_stat_wide(g_config) -1 downto 0);
rename_statb_i : in t_opa_matrix(f_opa_renamers(g_config)-1 downto 0, f_opa_stat_wide(g_config) -1 downto 0);
rename_bakx_o : out t_opa_matrix(f_opa_renamers(g_config)-1 downto 0, f_opa_back_wide(g_isa,g_config)-1 downto 0);
-- Exceptions from the EUs
-- One bit / one row per execution unit. The PC ports omit the low
-- f_opa_op_align(g_isa) address bits.
eu_oldest_o : out std_logic_vector(f_opa_executers(g_config)-1 downto 0);
eu_retry_i : in std_logic_vector(f_opa_executers(g_config)-1 downto 0);
eu_fault_i : in std_logic_vector(f_opa_executers(g_config)-1 downto 0);
eu_pc_i : in t_opa_matrix(f_opa_executers(g_config)-1 downto 0, f_opa_adr_wide(g_config)-1 downto f_opa_op_align(g_isa));
eu_pcf_i : in t_opa_matrix(f_opa_executers(g_config)-1 downto 0, f_opa_fet_wide(g_config)-1 downto 0);
eu_pcn_i : in t_opa_matrix(f_opa_executers(g_config)-1 downto 0, f_opa_adr_wide(g_config)-1 downto f_opa_op_align(g_isa));
-- Selected fault fed back up pipeline
-- Same PC/PCF/PCN shapes as the eu_* inputs, but for a single selected fault.
rename_fault_o : out std_logic;
rename_mask_o : out std_logic_vector(f_opa_renamers(g_config)-1 downto 0);
rename_pc_o : out std_logic_vector(f_opa_adr_wide(g_config)-1 downto f_opa_op_align(g_isa));
rename_pcf_o : out std_logic_vector(f_opa_fet_wide(g_config)-1 downto 0);
rename_pcn_o : out std_logic_vector(f_opa_adr_wide(g_config)-1 downto f_opa_op_align(g_isa));
-- Regfile needs to fetch these for EU
-- One bit / one row per execution unit.
regfile_rstb_o : out std_logic_vector(f_opa_executers(g_config)-1 downto 0);
regfile_geta_o : out std_logic_vector(f_opa_executers(g_config)-1 downto 0);
regfile_getb_o : out std_logic_vector(f_opa_executers(g_config)-1 downto 0);
regfile_aux_o : out t_opa_matrix(f_opa_executers(g_config)-1 downto 0, f_opa_aux_wide (g_config)-1 downto 0);
regfile_dec_o : out t_opa_matrix(f_opa_executers(g_config)-1 downto 0, f_opa_ren_wide (g_config)-1 downto 0);
regfile_baka_o : out t_opa_matrix(f_opa_executers(g_config)-1 downto 0, f_opa_back_wide(g_isa,g_config)-1 downto 0);
regfile_bakb_o : out t_opa_matrix(f_opa_executers(g_config)-1 downto 0, f_opa_back_wide(g_isa,g_config)-1 downto 0);
-- Regfile should capture result from EU
regfile_wstb_o : out std_logic_vector(f_opa_executers(g_config)-1 downto 0);
regfile_bakx_o : out t_opa_matrix(f_opa_executers(g_config)-1 downto 0, f_opa_back_wide(g_isa,g_config)-1 downto 0);
-- Gather information from L1d about aliased loads
-- One bit / one row per slow unit. The address covers only bits
-- f_opa_alias_high(g_isa) downto f_opa_alias_low(g_config); the mask has one
-- bit per byte of a register (f_opa_reg_wide(g_config)/8).
l1d_store_i : in std_logic;
l1d_load_i : in std_logic_vector(f_opa_num_slow(g_config)-1 downto 0);
l1d_addr_i : in t_opa_matrix(f_opa_num_slow(g_config)-1 downto 0, f_opa_alias_high(g_isa) downto f_opa_alias_low(g_config));
l1d_mask_i : in t_opa_matrix(f_opa_num_slow(g_config)-1 downto 0, f_opa_reg_wide(g_config)/8-1 downto 0));
end opa_issue;
architecture rtl of opa_issue is
-- Local shorthands for the sizes derived from the generics, so the rest of
-- the architecture does not have to repeat the f_opa_* calls.
constant c_op_align : natural := f_opa_op_align (g_isa);
constant c_num_arch : natural := f_opa_num_arch (g_isa);
-- c_num_stat is the size of the instruction window (see the comment block below).
constant c_num_stat : natural := f_opa_num_stat (g_config);
constant c_num_fast : natural := f_opa_num_fast (g_config);
constant c_num_slow : natural := f_opa_num_slow (g_config);
constant c_back_wide : natural := f_opa_back_wide(g_isa,g_config);
constant c_aux_wide : natural := f_opa_aux_wide (g_config);
constant c_ren_wide : natural := f_opa_ren_wide (g_config);
constant c_stat_wide : natural := f_opa_stat_wide(g_config);
constant c_adr_wide : natural := f_opa_adr_wide (g_config);
constant c_alias_low : natural := f_opa_alias_low(g_config);
constant c_alias_high: natural := f_opa_alias_high(g_isa);
-- Register width expressed in bytes (register width in bits divided by 8).
constant c_reg_bytes : natural := f_opa_reg_wide (g_config)/8;
constant c_fet_wide : natural := f_opa_fet_wide (g_config);
constant c_renamers : natural := f_opa_renamers (g_config);
constant c_executers : natural := f_opa_executers(g_config);
-- Index of fast unit 0 and slow unit 0, as returned by the package helpers.
-- NOTE(review): presumably positions within the c_executers-wide vectors; confirm in opa_functions_pkg.
constant c_fast0 : natural := f_opa_fast_index(g_config, 0);
constant c_slow0 : natural := f_opa_slow_index(g_config, 0);
-- Number of window slots per entry of s_ready_pads; that array is declared
-- with ceil(c_num_stat / c_mux_share) entries.
constant c_mux_share : natural := 2;
-- Constant bit patterns used as masks / comparison values.
constant c_stat_ones : std_logic_vector(c_num_stat -1 downto 0) := (others => '1');
constant c_fast_zeros : std_logic_vector(c_num_fast -1 downto 0) := (others => '0');
constant c_slow_ones : std_logic_vector(c_num_slow -1 downto 0) := (others => '1');
-- Upper c_num_slow bits are '1', lower c_num_fast bits are '0'.
-- NOTE(review): this layout implies c_executers = c_num_slow + c_num_fast with
-- the slow units in the upper indices; confirm against f_opa_slow_index.
constant c_slow_only : std_logic_vector(c_executers-1 downto 0) := c_slow_ones & c_fast_zeros;
constant c_executer_ones : std_logic_vector(c_executers-1 downto 0) := (others => '1');
-- Initial contents of r_bakx, built by f_opa_labels (defined in the OPA packages).
constant c_init_bak : t_opa_matrix := f_opa_labels(c_num_stat, c_back_wide, c_num_arch);
-- OPA makes heavy use of speculative execution; instructions run opportunistically.
-- It can make these kinds of mistakes, detected during execution:
-- A. A branch/jump detects the next instruction is wrong (misprediction)
-- B. A load needed data not yet in the cache (cache miss)
-- C. A load/store accessed unmapped memory (page fault)
-- D. A load did not see the result of an older store (RaW hazard)
--
-- The OPA processor execution stage only has these public effects:
-- 1. A store to L1/memory
-- 2. A load from uncacheable memory
-- 3. Reporting a mispredicted branch/jump
-- ... none of these effects can be reversed in OPA, so they are not speculated.
--
-- To ensure in-order execution, instructions with public effects are only
-- executed when they are the oldest remaining instruction. This has the consequence
-- of limiting OPA to one store / bus operation at a time. In the future, it might
-- be possible to support multiple concurrent stores to L1 cache if the stores are
-- all the oldest remaining instructions, but this requires more write ports on L1
-- and I chose not to implement this in order to keep L1 reasonably cost effective.
--
-- Instructions in the window (c_num_stat) have these flags:
-- issued: already sent to the execution units
-- ready: result is available for dependants
-- final: no mistakes detected during execution
-- complete: instruction and all priors are final
--
-- An instruction may only be issued if both its inputs are ready.
-- Slow instructions transition ready to high 2 cycles after issued goes high.
-- Fast instructions transition issued and ready to high at the same time.
-- Both types transition final to high 4 cycles after issued goes high.
--
-- The oldest instruction is at index 0. Newer instructions have larger indexes.
-- Only complete instructions are shifted out of the window, updating the commit map.
-- As mentioned above, only the oldest incomplete instruction may cause public effects.
--
-- Incomplete instructions can have their issued+ready+final flags removed.
-- This can happen in three ways:
-- 1. One of the inputs of the instruction was retried (ready went to 0)
-- => on the following cycle issued+ready+final of the dependent instruction go to 0
-- 2. A store causes following loads to be retried (they alias)
-- => on the cycle the store goes final=1, the loads have issued+ready+final=0
-- 3. An instruction must be retried, because after 4 cycles it asked for retry
-- => on the cycle final WOULD have gone to 1, instead issued+ready go to 0
--
-- There is a slight wrinkle in this plan: inflight instructions (i.e. issued+!final).
-- When these have issued reset to 0, the running execution must not set ready/final.
-- Consider the three causes of a retry:
-- 1. If an input went unready, this is the easiest case.
-- The input precedes our instruction, so we won't be shifted out till after it.
-- It will be at least 4 cycles before that happens.
-- However, we must still ensure that our instruction does not prematurely
-- set ready/final=1 due to an inflight execution.
-- For a fast instruction, it is impossible for ready to go high if deps are unready.
-- A slow instruction must also block ready for this cycle.
-- Both must wipe the schedule so that final (and ready for slow) won't come later.
-- 2. We must ensure that retried loads are not shifted out!
-- The store final=1 and loads final=0 must be atomic.
-- 3. An instruction that does not go final has two sub-cases
-- a. The instruction was retried: this failed execution is ignored
-- b. The instruction was not retried: set issued+ready to 0
--
-- Here is how we handle mistakes A-D.
-- A. A branch/jump that detects the wrong next instruction will:
-- if it is preceded by non-final instructions, refuse to go final, causing its own reissue
-- otherwise, report the fault, causing an irreversible update to the branch predictor
-- B. A load that cannot find its data in cache will:
-- request the dbus to refill a selected line (this request can be ignored)
-- not go final, causing its own reissue later (dbus has 5 cycles to refill L1)
-- C. A page fault is handled like a mispredicted branch/jump; reissue until oldest, then fault
-- D. When a store executes, any loads with a matching address are reissued
-- For now, I consider addresses to alias if they have the same word offset in a page
--
-- To avoid wasteful reissue, we only issue stores once all priors are issued, making
-- it very likely that they execute only when all priors are final. Unfortunately,
-- without a dedicated 'ioload' instruction, we cannot likewise delay load issue.
--
-- To keep r_schedule0 as easy to compute as possible, half of the reservation station
-- is shifted early, and half is shifted late. r_schedule0 is late, as is anything fed
-- to the regfile stage. Anything used to feed r_schedule0 is shifted early.
-- These have 1 latency indexes
-- Scheduling matrices: one row per execution unit (or per fast / per slow
-- unit for the first two), one column per window slot (c_num_stat).
-- Every r_schedule* register starts out all-zero.
-- NOTE(review): the digit presumably counts cycles since issue and a trailing
-- 's' presumably marks a copy that has already been shifted; confirm against
-- the clocked processes further down the architecture.
signal s_schedule_fast : t_opa_matrix(c_num_fast-1 downto 0, c_num_stat-1 downto 0);
signal s_schedule_slow : t_opa_matrix(c_num_slow-1 downto 0, c_num_stat-1 downto 0);
signal r_schedule0 : t_opa_matrix(c_executers-1 downto 0, c_num_stat-1 downto 0) := (others => (others => '0'));
signal r_schedule1s : t_opa_matrix(c_executers-1 downto 0, c_num_stat-1 downto 0) := (others => (others => '0'));
signal r_schedule2 : t_opa_matrix(c_executers-1 downto 0, c_num_stat-1 downto 0) := (others => (others => '0'));
signal r_schedule3s : t_opa_matrix(c_executers-1 downto 0, c_num_stat-1 downto 0) := (others => (others => '0'));
signal r_schedule4s : t_opa_matrix(c_executers-1 downto 0, c_num_stat-1 downto 0) := (others => (others => '0'));
signal s_schedule_wb : t_opa_matrix(c_executers-1 downto 0, c_num_stat-1 downto 0);
-- These have 0 latency indexes (fed directly)
-- Per-slot flags of the instruction window, one bit per slot.
-- r_fast / r_slow start at '0'; r_issued and r_final start at '1', so every
-- slot initially looks like an instruction that has already issued and gone final.
-- NOTE(review): presumably this lets the empty window shift freely after reset; confirm.
signal r_fast : std_logic_vector(c_num_stat-1 downto 0) := (others => '0');
signal r_slow : std_logic_vector(c_num_stat-1 downto 0) := (others => '0');
signal s_issued : std_logic_vector(c_num_stat-1 downto 0);
signal s_new_issued : std_logic_vector(c_num_stat-1 downto 0);
signal r_issued : std_logic_vector(c_num_stat-1 downto 0) := (others => '1');
signal s_final : std_logic_vector(c_num_stat-1 downto 0);
signal s_new_final : std_logic_vector(c_num_stat-1 downto 0);
signal r_final : std_logic_vector(c_num_stat-1 downto 0) := (others => '1');
-- These have 0 latency indexes, but 1 latency content
-- One row per window slot, c_stat_wide bits each; both registers start all-ones.
-- NOTE(review): presumably each row is the window index of the instruction
-- producing operand A / B; confirm where s_readya / s_readyb are computed.
signal s_stata : t_opa_matrix(c_num_stat-1 downto 0, c_stat_wide-1 downto 0);
signal r_stata : t_opa_matrix(c_num_stat-1 downto 0, c_stat_wide-1 downto 0) := (others => (others => '1'));
signal s_statb : t_opa_matrix(c_num_stat-1 downto 0, c_stat_wide-1 downto 0);
signal r_statb : t_opa_matrix(c_num_stat-1 downto 0, c_stat_wide-1 downto 0) := (others => (others => '1'));
-- These have 1 latency indexes (fed by skidpad)
-- r_ready starts at '1' for every slot, matching r_issued / r_final.
-- r_bakx starts as c_init_bak; the other registers here have no initial value.
signal s_ready : std_logic_vector(c_num_stat-1 downto 0);
signal s_ready_slow : std_logic_vector(c_num_stat-1 downto 0);
signal s_new_ready : std_logic_vector(c_num_stat-1 downto 0);
signal r_ready : std_logic_vector(c_num_stat-1 downto 0) := (others => '1');
signal r_geta : std_logic_vector(c_num_stat-1 downto 0);
signal r_getb : std_logic_vector(c_num_stat-1 downto 0);
signal r_aux : t_opa_matrix(c_num_stat-1 downto 0, c_aux_wide -1 downto 0);
signal r_bakx : t_opa_matrix(c_num_stat-1 downto 0, c_back_wide-1 downto 0) := c_init_bak;
signal r_baka : t_opa_matrix(c_num_stat-1 downto 0, c_back_wide-1 downto 0);
signal r_bakb : t_opa_matrix(c_num_stat-1 downto 0, c_back_wide-1 downto 0);
-- Calculation of what to issue
-- t_ready_pad: array of vectors that are 2**c_stat_wide bits wide, i.e. one
-- bit for every value a c_stat_wide-bit index can take.
type t_ready_pad is array(natural range <>) of std_logic_vector(2**c_stat_wide-1 downto 0);
-- ceil(c_num_stat / c_mux_share) copies of the padded vector.
-- NOTE(review): presumably each copy feeds the operand muxes of c_mux_share
-- window slots to limit fan-out; confirm where s_ready_pads is driven.
signal s_ready_pads : t_ready_pad((c_num_stat+c_mux_share-1)/c_mux_share-1 downto 0);
signal s_ready_pad : std_logic_vector(2**c_stat_wide-1 downto 0) := (others => '0');
signal s_readya : std_logic_vector(c_num_stat-1 downto 0);
signal s_readyb : std_logic_vector(c_num_stat-1 downto 0);
signal s_readyab : std_logic_vector(c_num_stat-1 downto 0);
signal s_pending_fast : std_logic_vector(c_num_stat-1 downto 0);
signal s_pending_slow : std_logic_vector(c_num_stat-1 downto 0);
signal s_fast_issue : std_logic_vector(c_num_stat-1 downto 0);
signal r_fast_issue : std_logic_vector(c_num_stat-1 downto 0);
signal s_slow_issue : std_logic_vector(c_num_stat-1 downto 0);
signal r_slow_issue : std_logic_vector(c_num_stat-1 downto 0);
-- The three sources of reissue
-- (input went unready, load aliased with a store, EU asked for retry; see
-- the numbered list in the architecture header comment.)
-- Signals sized c_num_stat are per window slot; signals sized c_executers
-- are per execution unit.
signal s_nodep : std_logic_vector(c_num_stat-1 downto 0);
signal r_alias : std_logic_vector(c_num_stat-1 downto 0) := (others => '0');
signal s_retry : std_logic_vector(c_num_stat-1 downto 0);
signal r_retry : std_logic_vector(c_executers-1 downto 0) := (others => '0');
signal s_finalize : std_logic_vector(c_executers-1 downto 0);
signal r_wipe : std_logic_vector(c_num_stat-1 downto 0) := (others => '0');
signal s_wipe : t_opa_matrix(c_executers-1 downto 0, c_num_stat-1 downto 0);
-- Load buffer CAM
-- One entry per window slot: a valid bit, the address bits
-- c_alias_high downto c_alias_low, and a c_reg_bytes-wide byte mask.
-- These shapes mirror the l1d_addr_i / l1d_mask_i ports, except that the
-- first dimension here is the window slot rather than the slow unit.
-- Note the transposed shape of s_slow_schedule3s (slot x slow unit) compared
-- with the r_schedule* matrices (unit x slot).
signal s_slow_schedule3s : t_opa_matrix(c_num_stat-1 downto 0, c_num_slow-1 downto 0);
signal s_store_schedule3s: std_logic_vector(c_num_stat-1 downto 0);
signal s_after_store : std_logic_vector(c_num_stat-1 downto 0);
signal s_alias_write : std_logic_vector(c_num_stat-1 downto 0);
signal s_alias_valid : std_logic_vector(c_num_stat-1 downto 0);
signal r_alias_valid : std_logic_vector(c_num_stat-1 downto 0) := (others => '0');
signal s_alias_addr_new : t_opa_matrix(c_num_stat-1 downto 0, c_alias_high downto c_alias_low);
signal s_alias_addr : t_opa_matrix(c_num_stat-1 downto 0, c_alias_high downto c_alias_low);
signal r_alias_addr : t_opa_matrix(c_num_stat-1 downto 0, c_alias_high downto c_alias_low);
signal s_alias_mask_new : t_opa_matrix(c_num_stat-1 downto 0, c_reg_bytes-1 downto 0);
signal s_alias_mask : t_opa_matrix(c_num_stat-1 downto 0, c_reg_bytes-1 downto 0);
signal r_alias_mask : t_opa_matrix(c_num_stat-1 downto 0, c_reg_bytes-1 downto 0);
signal s_alias : std_logic_vector(c_num_stat-1 downto 0);
-- Determine if side effects are allowed
-- Per-slot "future" flags plus per-execution-unit oldest tracking.
-- NOTE(review): s_am_oldest carries an initial value although its s_ prefix
-- suggests a combinational signal; the initializer only matters in simulation
-- before the first driver update.
signal s_future_final : std_logic_vector(c_num_stat-1 downto 0);
signal s_future_complete : std_logic_vector(c_num_stat-1 downto 0);
signal s_future_pcomplete: std_logic_vector(c_num_stat-1 downto 0);
signal s_old : std_logic_vector(c_executers-1 downto 0);
signal r_old : std_logic_vector(c_executers-1 downto 0) := (others => '0');
signal s_oldest_candidate: std_logic_vector(c_executers-1 downto 0);
signal r_oldest_candidate: std_logic_vector(c_executers-1 downto 0) := (others => '0');
signal s_oldest_possible : std_logic;
signal s_am_oldest : std_logic_vector(c_executers-1 downto 0) := (others => '0');
-- Flow control of the issue pipeline
-- r_shift starts at '0' (no shift pending after reset).
signal s_stall : std_logic;
signal s_shift : std_logic;
signal r_shift : std_logic := '0';
-- Accept data from the renamer; use a skidpad to synchronize state
-- One bit / one row per renamer, matching the shapes of the rename_geta_i,
-- rename_getb_i and rename_bak*_i ports.
-- Unlike rename_aux_i (a plain vector), r_sp_aux is a matrix with one row per renamer.
signal r_sp_geta : std_logic_vector(c_renamers-1 downto 0);
signal r_sp_getb : std_logic_vector(c_renamers-1 downto 0);
signal r_sp_bakx : t_opa_matrix(c_renamers-1 downto 0, c_back_wide-1 downto 0);
signal r_sp_baka : t_opa_matrix(c_renamers-1 downto 0, c_back_wide-1 downto 0);
signal r_sp_bakb : t_opa_matrix(c_renamers-1 downto 0, c_back_wide-1 downto 0);
signal r_sp_aux : t_opa_matrix(c_renamers-1 downto 0, c_aux_wide -1 downto 0);
-- Faults are resolved to the oldest and executed when all preceding ops are final
-- The single-bit fault control registers all start at '0'.
-- The PC / PCF / PCN registers use the same bit ranges as the rename_pc_o /
-- rename_pcf_o / rename_pcn_o ports and have no initial value.
signal r_fault_in : std_logic_vector(c_executers-1 downto 0) := (others => '0');
signal s_fault_pending : std_logic;
signal r_fault_pending : std_logic := '0';
signal s_fault_out : std_logic;
signal r_fault_out : std_logic := '0'; -- lasts one cycle
signal r_fault_out1 : std_logic := '0'; -- one cycle delayed
signal r_fault_pipe : std_logic := '0'; -- lasts two cycles
signal r_fault_mask : std_logic_vector(c_renamers-1 downto 0);
signal r_fault_pc : std_logic_vector(c_adr_wide-1 downto c_op_align);
signal r_fault_pcf : std_logic_vector(c_fet_wide-1 downto 0);
signal r_fault_pcn : std_logic_vector(c_adr_wide-1 downto c_op_align);
-- Separate copies for the slow and the fast execution units.
-- NOTE(review): presumably reduced to r_fault_pc / pcf / pcn afterwards; confirm below.
signal r_fault_slow_pc : std_logic_vector(c_adr_wide-1 downto c_op_align);
signal r_fault_slow_pcf: std_logic_vector(c_fet_wide-1 downto 0);
signal r_fault_slow_pcn: std_logic_vector(c_adr_wide-1 downto c_op_align);
signal r_fault_fast_pc : std_logic_vector(c_adr_wide-1 downto c_op_align);
signal r_fault_fast_pcf: std_logic_vector(c_fet_wide-1 downto 0);
signal r_fault_fast_pcn: std_logic_vector(c_adr_wide-1 downto c_op_align);
-- Build the per-slot decoder label table.
--   renamers : number of renamers; must be non-zero because it is used as
--              the modulus below.
--   returns  : a c_num_stat x c_ren_wide matrix whose row s holds the value
--              (s mod renamers) as an unsigned binary number.
-- The parameter is now actually used as the modulus. Previously the body
-- ignored it and read the architecture constant c_renamers directly, so
-- the argument passed by the caller had no effect.
function f_decoder_labels(renamers : natural) return t_opa_matrix is
  variable result : t_opa_matrix(c_num_stat-1 downto 0, c_ren_wide-1 downto 0);
  variable value  : unsigned(result'range(2));
begin
  for s in result'range(1) loop
    -- Label of slot s, c_ren_wide bits wide
    value := to_unsigned(s mod renamers, value'length);
    -- t_opa_matrix is two-dimensional, so the row is filled bit by bit
    for b in value'range loop
      result(s,b) := value(b);
    end loop;
  end loop;
  return result;
end f_decoder_labels;
-- Label table for this configuration: slot s is labelled s mod c_renamers.
constant c_decoder_labels : t_opa_matrix := f_decoder_labels(c_renamers);
-- Conditionally shift a vector down by c_renamers positions.
--   x    : input vector (viewed as x'high downto x'low)
--   s    : shift enable; anything other than '1' returns x unchanged
--   fill : value placed in the c_renamers vacated upper positions
-- When s = '1', element i of the result is x(i + c_renamers) for every i
-- that still has a source element, and 'fill' otherwise.
function f_shift(x : std_logic_vector; s : std_logic; fill : std_logic := '0') return std_logic_vector is
  alias src : std_logic_vector(x'high downto x'low) is x;
  variable shifted : std_logic_vector(src'range);
begin
  -- No shift requested: hand back the input as-is
  if s /= '1' then
    return src;
  end if;

  for i in shifted'range loop
    if i + c_renamers <= src'high then
      shifted(i) := src(i + c_renamers);
    else
      shifted(i) := fill;
    end if;
  end loop;
  return shifted;
end f_shift;
-- Matrix flavour of f_shift: conditionally shift every row of x down by
-- c_renamers positions along its second dimension.
--
-- x : input matrix
-- s : shift enable; anything other than '1' returns x unchanged
--
-- When s = '1', result(i,j) = x(i,j+c_renamers) for the columns where that
-- source exists; the remaining (highest) c_renamers columns are '0'.
function f_shift(x : t_opa_matrix; s : std_logic) return t_opa_matrix is
  variable shifted : t_opa_matrix(x'range(1), x'range(2)) := (others => (others => '0'));
begin
  if s /= '1' then
    return x;
  end if;
  for row in x'range(1) loop
    for col in x'low(2) to x'high(2)-c_renamers loop
      shifted(row, col) := x(row, col+c_renamers);
    end loop;
  end loop;
  return shifted;
end f_shift;
begin
-- Simulation-only sanity checks: on every rising clock edge, verify that the
-- control signals hold no metavalues (as judged by f_opa_safe).
-- Control signals abort the simulation (failure); bad station references
-- only produce a warning.
check : process(clk_i) is
begin
  if rising_edge(clk_i) then
    -- Control inputs arriving from other stages
    assert f_opa_safe(rename_stb_i) = '1'
      report "issue: rename_stb_i has metavalue"
      severity failure;
    assert f_opa_safe(l1d_store_i) = '1'
      report "issue: l1d_store_i has metavalue"
      severity failure;
    assert f_opa_safe(l1d_load_i) = '1'
      report "issue: l1d_load_i has metavalue"
      severity failure;

    -- Control state local to this stage
    assert f_opa_safe(r_shift) = '1'
      report "issue: r_shift has metavalue"
      severity failure;
    assert f_opa_safe(s_shift) = '1'
      report "issue: s_shift has metavalue"
      severity failure;
    assert f_opa_safe(r_fault_pipe) = '1'
      report "issue: r_fault_pipe has metavalue"
      severity failure;
    assert f_opa_safe(r_fault_out) = '1'
      report "issue: r_fault_out has metavalue"
      severity failure;
    assert f_opa_safe(s_fault_out) = '1'
      report "issue: s_fault_out has metavalue"
      severity failure;
    assert f_opa_safe(r_fault_in) = '1'
      report "issue: r_fault_in has metavalue"
      severity failure;

    -- Per-station operand references
    for st in 0 to c_num_stat-1 loop
      assert f_opa_safe(f_opa_select_row(r_stata, st)) = '1'
        report "issue: stata bad"
        severity warning;
      assert f_opa_safe(f_opa_select_row(r_statb, st)) = '1'
        report "issue: statb bad"
        severity warning;
    end loop;
  end if;
end process;
-- Simulation-only invariant checker for the scheduler state.
-- On every rising clock edge it cross-checks the issued/ready/final flags,
-- the five schedule pipeline stages, the load alias CAM flags and the
-- "old"/"oldest" bookkeeping. Any violation aborts the simulation.
invariants : process(clk_i) is
  -- Schedule stages with wiped stations masked out; stages 0 and 2 are also
  -- realigned with r_shift so that all five use the same station indexes
  variable v_schedule0s : t_opa_matrix(c_executers-1 downto 0, c_num_stat-1 downto 0);
  variable v_schedule1s : t_opa_matrix(c_executers-1 downto 0, c_num_stat-1 downto 0);
  variable v_schedule2s : t_opa_matrix(c_executers-1 downto 0, c_num_stat-1 downto 0);
  variable v_schedule3s : t_opa_matrix(c_executers-1 downto 0, c_num_stat-1 downto 0);
  variable v_schedule4s : t_opa_matrix(c_executers-1 downto 0, c_num_stat-1 downto 0);
  variable v_schedule2p : std_logic_vector(c_num_stat-1 downto 0); -- stations in stage 2 or later
  variable v_seen       : std_logic_vector(c_num_stat-1 downto 0); -- stations found in any stage
  variable v_old        : std_logic_vector(c_num_stat-1 downto 0); -- stations flagged by r_old
  variable v_complete   : std_logic_vector(c_num_stat-1 downto 0); -- unbroken run of 1s from bit 0
begin
  if rising_edge(clk_i) then
    -- r_final => r_ready (s_ready b/c r_ready has indexes one cycle late)
    assert (f_opa_or(r_final and not s_ready) = '0')
      report "issue: final operation that is not ready!"
      severity failure;
    -- r_ready => r_issued (s_issued b/c issued is actually the union of three vectors)
    assert (f_opa_or(s_ready and not s_issued) = '0')
      report "issue: ready operation that is not issued!"
      severity failure;
    -- r_issued => r_ready for fast ops
    assert (f_opa_or(s_issued and r_fast and not s_ready) = '0')
      report "issue: issued fast operation is not ready!"
      severity failure;
    -- If the CAM has an entry, it better be a slow op (entries are written from l1d_load_i)
    assert (f_opa_or(r_alias_valid and not r_slow) = '0')
      report "issue: load alias for non-slow op!"
      severity failure;
    -- r_alias => r_final (we only wake up final ops)
    assert (f_opa_or(r_alias and not r_final) = '0')
      report "issue: load alias for non-final op!"
      severity failure;

    -- Start checking the schedule
    v_schedule0s := not f_opa_dup_row(c_executers, r_wipe) and f_shift(r_schedule0, r_shift);
    v_schedule1s := not f_opa_dup_row(c_executers, r_wipe) and r_schedule1s;
    v_schedule2s := not f_opa_dup_row(c_executers, r_wipe) and f_shift(r_schedule2, r_shift);
    v_schedule3s := not f_opa_dup_row(c_executers, r_wipe) and r_schedule3s;
    v_schedule4s := not f_opa_dup_row(c_executers, r_wipe) and r_schedule4s;

    -- r_schedule2+ => r_ready for all ops
    v_schedule2p := f_opa_product(f_opa_transpose(v_schedule2s or v_schedule3s or v_schedule4s), c_executer_ones);
    assert (f_opa_or(v_schedule2p and not s_ready) = '0')
      report "issue: scheduled op older than 2 cycles is not ready!"
      severity failure;

    -- An instruction can only be in-flight for one EU at one offset at a time
    v_seen := (others => '0');
    for u in 0 to c_executers-1 loop
      for s in 0 to c_num_stat-1 loop
        assert (v_seen(s) = '0' or v_schedule0s(u,s) = '0') report "issue: double-scheduled operation" severity failure;
        v_seen(s) := v_seen(s) or v_schedule0s(u,s);
        assert (v_seen(s) = '0' or v_schedule1s(u,s) = '0') report "issue: double-scheduled operation" severity failure;
        v_seen(s) := v_seen(s) or v_schedule1s(u,s);
        assert (v_seen(s) = '0' or v_schedule2s(u,s) = '0') report "issue: double-scheduled operation" severity failure;
        v_seen(s) := v_seen(s) or v_schedule2s(u,s);
        assert (v_seen(s) = '0' or v_schedule3s(u,s) = '0') report "issue: double-scheduled operation" severity failure;
        v_seen(s) := v_seen(s) or v_schedule3s(u,s);
        assert (v_seen(s) = '0' or v_schedule4s(u,s) = '0') report "issue: double-scheduled operation" severity failure;
        v_seen(s) := v_seen(s) or v_schedule4s(u,s);
      end loop;
    end loop;

    -- If it's scheduled, it better be issued!
    assert (f_opa_or(v_seen and not s_issued) = '0')
      report "issue: scheduled operation is not issued!"
      severity failure;
    -- If it's issued, it better be scheduled or final!
    assert (f_opa_or(s_issued and not v_seen and not r_final) = '0')
      report "issue: issued operation is not scheduled!"
      severity failure;
    -- If it's scheduled, it better not be final!
    assert (f_opa_or(v_seen and r_final) = '0')
      report "issue: scheduled operation is final!"
      severity failure;

    -- Confirm r_old makes sense
    for u in 0 to c_executers-1 loop
      assert (r_old(u) = '0' or f_opa_or(f_opa_select_row(v_schedule4s, u)) = '1')
        report "issue: unscheduled old instruction"
        severity failure;
    end loop;

    -- Find the instructions we claim are old
    v_old := f_opa_product(f_opa_transpose(v_schedule4s), r_old);
    assert (f_opa_or(v_old and r_final) = '0')
      report "issue: an old instruction cannot be final!"
      severity failure;

    -- r_old combined with r_final must form unbroken 1s including r_old
    -- (x and not (x+1) keeps only the contiguous run of 1s starting at bit 0)
    v_complete := v_old or r_final;
    v_complete := v_complete and not std_logic_vector(unsigned(v_complete) + 1);
    assert (f_opa_or(v_old and not v_complete) = '0')
      report "issue: an old instruction did not form a part of the new complete chain"
      severity failure;

    -- The current 'oldest' instruction had better be correct
    assert (s_am_oldest(c_fast0) = '0' or s_old(c_fast0) = '1')
      report "issue: fast0 s_am_oldest, but not old?!"
      severity failure;
    -- Fixed: this check is about slow0, the message used to say fast0
    assert (s_am_oldest(c_slow0) = '0' or s_old(c_slow0) = '1')
      report "issue: slow0 s_am_oldest, but not old?!"
      severity failure;
    assert (s_am_oldest(c_fast0) = '0' or s_am_oldest(c_slow0) = '0')
      report "issue: there can not be two oldest"
      severity failure;
  end if;
end process;
-- Which stations are already issued?
-- Union of the stations issued in the previous cycle (r_fast_issue/r_slow_issue,
-- realigned by f_shift when r_shift='1') and the registered r_issued flags.
s_issued <= f_shift(r_fast_issue or r_slow_issue, r_shift) or r_issued;
-- Which stations have ready operands?
--
-- What follows is a fancy way of doing this:
-- s_ready_pad(s_ready_pad'high) <= '1';
-- s_ready_pad(r_ready'range) <= r_ready;
-- s_readya <= f_opa_compose(s_ready_pad, r_stata);
-- s_readyb <= f_opa_compose(s_ready_pad, r_statb);
--
-- We know that r_stat[ab] never refer backwards, so there are wasted cases
-- in the mux. The ready muxes are the largest component in the critical path,
-- so this seemingly small optimization does matter.
--
-- For this calculation, insert '-'s for backward references (impossible) to save area.
-- NOTE(review): only the r_ready'range slice of s_ready_pad is driven here; the
-- zero padding presumably comes from the signal declaration -- confirm.
s_ready_pad(r_ready'range) <= r_ready; -- pad with 0s
-- One padded ready vector per group of c_mux_share stations:
--   * the top index (2**c_stat_wide-1) is forced to '1'
--   * indexes 0 .. (i+1)*c_mux_share+c_renamers-2 copy s_ready_pad
--   * everything in between (if any) is don't-care
pads : for i in 0 to ((c_num_stat+c_mux_share-1)/c_mux_share)-1 generate
s_ready_pads(i)(2**c_stat_wide-1) <= '1'; -- no-stat-dep means always ready
s_ready_pads(i)((i+1)*c_mux_share+c_renamers-2 downto 0) <= s_ready_pad((i+1)*c_mux_share+c_renamers-2 downto 0);
gap : if 2**c_stat_wide-2 >= (i+1)*c_mux_share+c_renamers-1 generate
s_ready_pads(i)(2**c_stat_wide-2 downto (i+1)*c_mux_share+c_renamers-1) <= (others => '-');
end generate;
end generate;
-- Station i looks up its two operand references (rows of r_stata/r_statb read
-- as unsigned indexes) in the padded vector of its group (i / c_mux_share)
compose : for i in 0 to c_num_stat-1 generate
s_readya(i) <= s_ready_pads(i / c_mux_share)(to_integer(unsigned(f_opa_select_row(r_stata, i))));
s_readyb(i) <= s_ready_pads(i / c_mux_share)(to_integer(unsigned(f_opa_select_row(r_statb, i))));
end generate;
-- Which stations are pending issue?
-- Pending = both operands ready, not yet issued, and of the matching class
s_readyab <= s_readya and s_readyb; -- 3 levels (for stat_wide <= 5)
s_pending_fast <= s_readyab and not s_issued and r_fast;
s_pending_slow <= s_readyab and not s_issued and r_slow;
-- Derive the schedule from the pending instructions
-- Two opa_prefixsum instances, one per unit class. Each takes the pending vector
-- (width c_num_stat) and is sized for c_num_fast / c_num_slow units.
-- NOTE(review): opa_prefixsum is defined elsewhere; count_o is used below as a
-- per-unit station selection and total_o as a per-station "gets issued" mask -- confirm.
fast : opa_prefixsum
generic map(
g_target => g_target,
g_width => c_num_stat,
g_count => c_num_fast)
port map(
bits_i => s_pending_fast,
count_o => s_schedule_fast,
total_o => s_fast_issue);
slow : opa_prefixsum
generic map(
g_target => g_target,
g_width => c_num_stat,
g_count => c_num_slow)
port map(
bits_i => s_pending_slow,
count_o => s_schedule_slow,
total_o => s_slow_issue);
-- Report our scheduling decision to the regfile
-- r_bak[abx], r_aux shifted one cycle later, so s_stat has correct index
-- Every output combines the registered schedule r_schedule0 with one per-station
-- attribute through f_opa_product (helper defined elsewhere; presumably this picks
-- the attribute of the station scheduled on each unit -- confirm).
regfile_rstb_o <= f_opa_product(r_schedule0, c_stat_ones);
regfile_geta_o <= f_opa_product(r_schedule0, r_geta);
regfile_getb_o <= f_opa_product(r_schedule0, r_getb);
regfile_baka_o <= f_opa_product(r_schedule0, r_baka);
regfile_bakb_o <= f_opa_product(r_schedule0, r_bakb);
regfile_aux_o <= f_opa_product(r_schedule0, r_aux);
-- c_decoder_labels row s encodes (s mod c_renamers)
regfile_dec_o <= f_opa_product(r_schedule0, c_decoder_labels);
-- 2 levels with stations <= 18
-- Report our writeback schedule to the regfile
-- Rows 0 .. c_num_fast-1 (fast units) come from r_schedule0, rows
-- c_num_fast .. c_executers-1 (slow units) come from r_schedule2.
wb_sched : for j in 0 to c_num_stat-1 generate
fast : for i in 0 to c_num_fast-1 generate
s_schedule_wb(i,j) <= r_schedule0(i,j);
end generate;
slow : for i in c_num_fast to c_executers-1 generate
s_schedule_wb(i,j) <= r_schedule2(i,j);
end generate;
end generate;
regfile_wstb_o <= f_opa_product(s_schedule_wb, c_stat_ones);
regfile_bakx_o <= f_opa_product(s_schedule_wb, r_bakx);
-- All the reasons we might have to reissue instructions
-- 1. s_nodep: at least one operand is not ready
s_nodep <= not s_readyab;
-- 2. r_alias (registered; computed by the load alias CAM)
-- 3. s_retry: stations in schedule stage 4 whose unit reported r_retry, unless wiped
s_retry <= f_opa_product(f_opa_transpose(r_schedule4s), r_retry) and not r_wipe; -- EU wants to re-run
-- issued must go low in all three cases
s_new_issued <= s_issued and not (s_nodep or r_alias or s_retry);
-- ready must go low in all three cases, however there are three sources of readiness
-- 1. old readiness / instructions where r_ready=1 already
-- => these must be masked out by all three cases
-- 2. slow readiness / slow instructions issued 2 cycles ago
-- => s_nodep
-- => s_alias is impossible => it implies the op was final
-- => s_retry is impossible => it implies simultaneous execution
-- 3. fast readiness / fast instructions issued just now
-- => s_nodep is already considered via s_pending_fast
-- => s_alias does not apply to fast instructions, only slow ones (loads)
-- => s_retry is impossible => it implies simultaneous execution
-- r_ready realigned to the current window; vacated positions are filled with r_fault_out1
s_ready <= f_shift(r_ready, r_shift, r_fault_out1);
s_ready_slow <=
not (s_nodep or r_alias or s_retry) and
(s_ready or (f_opa_product(f_opa_transpose(r_schedule1s), c_slow_only) and not r_wipe));
s_new_ready <= (s_fast_issue and s_pending_fast) or s_ready_slow;
-- final must go low in all three cases, however not all must be considered for completeness/shift
-- => s_nodep is irrelevant to completeness, because the (older) input already blocks complete
-- => s_retry is actually the opposite here; we only go final if it's false
-- => r_alias must be considered, in order for store final=1 to be atomic with load final=0
-- however, r_alias cannot affect anything scheduled, as they are not final
s_finalize <= not r_retry;
s_final <= (r_final and not r_alias) or (f_opa_product(f_opa_transpose(r_schedule4s), s_finalize) and not r_wipe);
s_new_final <= s_final and not s_nodep;
-- Determine if the execution window should be shifted
-- Stall unless the oldest c_renamers stations are all final; a fault (r_fault_out)
-- forces a shift regardless of rename_stb_i and the stall.
s_stall <= not f_opa_and(s_final(c_renamers-1 downto 0));
s_shift <= (rename_stb_i and not s_stall) or r_fault_out;
rename_stall_o <= s_stall;
-- Plan oldest calculation one cycle ahead:
-- Recall that complete = this and all later instructions are final.
-- Nothing can undo complete instructions; alias/nodep affect younger instructions and retry scheduled
-- Define old = will be complete if all the instructions at schedule4 go final
-- Old instructions can be prevented from becoming complete if they or an older old instruction
-- report retry. Only old instructions matter. Suppose none of them retry. Then:
-- 1. s_nodep can't stop them; older are all final => ready
-- 2. r_alias can't stop them; the only younger instructions are also old, which means that they
-- ran in the same cycle. same cycle aliases are resolved with a retry.
-- Therefore, the only thing that prevents an old instruction from going final is retry.
--
-- Now, suppose we are an instruction that runs in the cycle after the current old instructions.
-- The question is: will we be the oldest in that cycle?
-- Well, to answer this, suppose first that all the old instructions succeed (s_future_final).
-- If our predecessor is then complete (s_future_complete), we must be oldest!
--
-- Obviously, not all the instructions last cycle necessarily complete. However, we only have
-- to care about the old instructions, because only those can precede us if we become oldest.
-- As argued above, those old instructions can only be stopped by a retry. So we just need to
-- know two things: which executed instructions are old (r_old), and did they retry?
--
-- If an instruction is old, then its predecessor this cycle must be complete, so we can just
-- re-use s_future_complete to check. We record this into r_was_old to combine with r_retry.
-- We don't need to consider s_nodep in s_future_final, because nodep => an earlier non-final,
-- and we don't care about future_final per-se, but about future_complete.
s_future_final <= s_final or (f_opa_product(f_opa_transpose(r_schedule3s), c_executer_ones) and not r_wipe);
-- x and not (x+1) keeps only the contiguous run of 1s starting at bit 0
s_future_complete <= s_future_final and not std_logic_vector(unsigned(s_future_final) + 1);
-- "Predecessor complete": s_future_complete moved up one position with a '1' shifted into
-- bit 0, masked by not r_wipe
s_future_pcomplete <= not r_wipe and (s_future_complete(s_future_complete'high-1 downto s_future_complete'low) & '1');
-- Next cycle, these instructions are old
s_old <= f_opa_product(r_schedule3s, s_future_pcomplete);
-- In two cycles, these instructions can become oldest IF the next cycle's old instructions complete
s_oldest_candidate <= f_opa_product(f_shift(r_schedule2, r_shift), s_future_pcomplete);
-- Let's prove a bit more in-depth that the above cannot go wrong
-- 1. alias. Not an issue, because these ops are scheduled => !final => !r_alias
-- 2. retry. This is explicitly covered for schedule4 by using s_final and schedule3 using r_retry
-- 3. nodep. If schedule4 fail, then s_future_complete will be zero beyond that point
-- If schedule3 fail, that is covered by considering their r_retry
-- schedule2 cannot have cross-depends
-- No other instructions are earlier, if we are an oldest candidate.
-- Possible only when every unit flagged in r_old is finalizing (not retrying)
s_oldest_possible <= f_opa_and(not r_old or s_finalize);
s_am_oldest(c_fast0) <= s_oldest_possible and r_oldest_candidate(c_fast0);
s_am_oldest(c_slow0) <= s_oldest_possible and r_oldest_candidate(c_slow0);
eu_oldest_o <= s_am_oldest;
-- Forward the fault up the pipeline
-- All five outputs are driven straight from registers
rename_fault_o <= r_fault_out;
rename_mask_o <= r_fault_mask;
rename_pc_o <= r_fault_pc;
rename_pcf_o <= r_fault_pcf;
rename_pcn_o <= r_fault_pcn;
-- faults always come with an s_shift
-- We can use r_final instead of s_final/s_stall because a fault only happens if it was last
-- Only units c_fast0 and c_slow0 are inspected for faults here
s_fault_pending <= r_fault_in(c_fast0) or r_fault_in(c_slow0);
-- Raise the fault when one is pending (this cycle or remembered) and the oldest
-- c_renamers stations are not all final
s_fault_out <= (s_fault_pending or r_fault_pending) and
not f_opa_and(r_final(c_renamers-1 downto 0));
-- Fault control registers (asynchronous active-low reset).
--   r_fault_out1 : r_fault_out delayed by one cycle
--   r_fault_pipe : high in the cycle s_fault_out or r_fault_out was high
--   r_fault_in / r_fault_pending / r_fault_out : track incoming faults, and are
--     all cleared in the cycle after r_fault_out was asserted
fault_ctl : process(clk_i, rst_n_i) is
begin
  if rst_n_i = '0' then
    r_fault_in      <= (others => '0');
    r_fault_pending <= '0';
    r_fault_out     <= '0';
    r_fault_out1    <= '0';
    r_fault_pipe    <= '0';
  elsif rising_edge(clk_i) then
    -- Unconditional delay/stretch registers
    r_fault_out1 <= r_fault_out;
    r_fault_pipe <= s_fault_out or r_fault_out;

    -- Default: keep tracking faults reported by the execution units
    r_fault_in      <= eu_fault_i;
    r_fault_pending <= r_fault_pending or s_fault_pending;
    r_fault_out     <= s_fault_out;

    -- A fault was just delivered: drop everything collected so far
    if r_fault_out = '1' then
      r_fault_in      <= (others => '0');
      r_fault_pending <= '0';
      r_fault_out     <= '0';
    end if;
  end if;
end process;
-- Fault address/mask registers (no reset).
--   r_fault_mask : when s_fault_out is high, the contiguous run of 1s of
--                  s_final(c_renamers-1 downto 0) starting at bit 0; all 1s otherwise
--   r_fault_{fast,slow}_* : registered PC rows of units c_fast0 / c_slow0
--   r_fault_pc/pcf/pcn    : loaded from the staged values of whichever unit has
--                           its r_fault_in bit set; held otherwise
fault_adr : process(clk_i) is
begin
  if rising_edge(clk_i) then
    if s_fault_out = '1' then
      r_fault_mask <= s_final(c_renamers-1 downto 0) and
                      not std_logic_vector(unsigned(s_final(c_renamers-1 downto 0)) + 1);
    else
      r_fault_mask <= (others => '1');
    end if;

    -- First stage: capture the PCs reported by both candidate units
    r_fault_slow_pc  <= f_opa_select_row(eu_pc_i,  c_slow0);
    r_fault_slow_pcf <= f_opa_select_row(eu_pcf_i, c_slow0);
    r_fault_slow_pcn <= f_opa_select_row(eu_pcn_i, c_slow0);
    r_fault_fast_pc  <= f_opa_select_row(eu_pc_i,  c_fast0);
    r_fault_fast_pcf <= f_opa_select_row(eu_pcf_i, c_fast0);
    r_fault_fast_pcn <= f_opa_select_row(eu_pcn_i, c_fast0);

    -- Second stage: pick the faulting unit's PC; with neither bit set the
    -- registers simply keep their value. The two cases are expected to be
    -- mutually exclusive; should both be set, the slow unit wins.
    if r_fault_in(c_slow0) = '1' then
      r_fault_pc  <= r_fault_slow_pc;
      r_fault_pcf <= r_fault_slow_pcf;
      r_fault_pcn <= r_fault_slow_pcn;
    elsif r_fault_in(c_fast0) = '1' then
      r_fault_pc  <= r_fault_fast_pc;
      r_fault_pcf <= r_fault_fast_pcf;
      r_fault_pcn <= r_fault_fast_pcn;
    end if;
  end if;
end process;
-- Extract the slow unit schedule
-- s_slow_schedule3s is the slow-unit part of r_schedule3s with the two dimensions
-- swapped (station, slow unit); s_store_schedule3s is the row of unit c_num_fast,
-- i.e. the first slow unit.
slow_sched3 : for j in 0 to c_num_stat-1 generate
ldst : for i in 0 to c_num_slow-1 generate
s_slow_schedule3s(j,i) <= r_schedule3s(i+c_num_fast,j);
end generate;
s_store_schedule3s(j) <= r_schedule3s(c_num_fast,j);
end generate;
-- Which operations come AFTER the store?
-- (not x) + 1 is the two's-complement negation of x: for a one-hot x with bit k set
-- the result has 1s at bit k and above; for x = 0 the result is 0.
s_after_store <= std_logic_vector(unsigned(not s_store_schedule3s) + 1);
-- Add new loads to the CAM
-- A station is written when its slow unit reports l1d_load_i and it was not wiped
s_alias_write <= f_opa_product(s_slow_schedule3s, l1d_load_i) and not r_wipe;
s_alias_valid <= s_alias_write or r_alias_valid;
s_alias_addr_new <= f_opa_product(s_slow_schedule3s, l1d_addr_i);
s_alias_mask_new <= f_opa_product(s_slow_schedule3s, l1d_mask_i);
s_alias_addr <= f_opa_mux(s_alias_write, s_alias_addr_new, r_alias_addr);
s_alias_mask <= f_opa_mux(s_alias_write, s_alias_mask_new, r_alias_mask);
-- Process the load alias CAM
-- Note: l1d_store_i might strobe even though it was wiped.
-- This is harmless. If it was wiped, it can't be final or complete.
-- Therefore, anything we restart (which is newer) is safe to reissue.
-- A station aliases when a store strobes, the station has a valid CAM entry located
-- at/after the store, the recorded address equals row 0 of l1d_addr_i and the
-- recorded mask overlaps row 0 of l1d_mask_i.
alias_check : for s in 0 to c_num_stat-1 generate
s_alias(s) <= l1d_store_i and r_alias_valid(s) and s_after_store(s)
and f_opa_eq(f_opa_select_row(r_alias_addr, s), f_opa_select_row(l1d_addr_i, 0))
and f_opa_or(f_opa_select_row(r_alias_mask, s) and f_opa_select_row(l1d_mask_i, 0));
end generate;
-- Prepare decremented versions of the station references
-- When the window shifted last cycle (r_shift='1') the references are adjusted with
-- f_opa_decrement(..., c_renamers); otherwise they pass through unchanged.
-- NOTE(review): f_opa_decrement is defined elsewhere -- presumably it subtracts
-- c_renamers from every row; confirm.
s_stata <= f_opa_decrement(r_stata, c_renamers) when r_shift='1' else r_stata;
s_statb <= f_opa_decrement(r_statb, c_renamers) when r_shift='1' else r_statb;
-- Feed unused registers back to the renamer
-- Rows 0 .. c_renamers-1 of r_bakx, or rows c_renamers .. 2*c_renamers-1 when
-- r_shift='1' (the r_bakx shift itself is only registered on r_shift).
bakx_o : for b in 0 to c_back_wide-1 generate
dec : for i in 0 to c_renamers-1 generate
rename_bakx_o(i,b) <= r_bakx(i+c_renamers,b) when r_shift='1' else r_bakx(i,b);
end generate;
end generate;
-- Register the renamer inputs; no reset, clock enable is s_shift.
-- While a fault is being delivered (r_fault_out='1') the geta/getb flags are
-- captured as all zeros instead of the renamer values.
skidpad : process(clk_i) is
begin
  if rising_edge(clk_i) then
    if s_shift = '1' then
      -- Backing registers and aux data are always taken from the renamer
      r_sp_aux  <= f_opa_dup_row(c_renamers, rename_aux_i);
      r_sp_bakx <= rename_bakx_i;
      r_sp_baka <= rename_baka_i;
      r_sp_bakb <= rename_bakb_i;

      -- Operand fetch flags: renamer values by default, zero on a fault
      r_sp_geta <= rename_geta_i;
      r_sp_getb <= rename_getb_i;
      if r_fault_out = '1' then
        r_sp_geta <= (others => '0');
        r_sp_getb <= (others => '0');
      end if;
    end if;
  end if;
end process;
-- Station flags, 0-latency, with asynchronous reset and a synchronous clear.
-- Both the reset and r_fault_pipe='1' put the stations into the same state:
-- everything issued and final, the alias CAM empty, no old/oldest candidates.
-- Otherwise the next-state vectors are loaded, shifted down when s_shift='1'.
stations_0rs : process(rst_n_i, clk_i) is
begin
  if rst_n_i = '0' then -- asynchronous clear
    r_issued           <= (others => '1');
    r_final            <= (others => '1');
    r_alias_valid      <= (others => '0');
    r_alias            <= (others => '0');
    r_old              <= (others => '0');
    r_oldest_candidate <= (others => '0');
  elsif rising_edge(clk_i) then
    -- Normal update
    r_issued           <= f_shift(s_new_issued, s_shift);
    r_final            <= f_shift(s_new_final, s_shift);
    r_alias_valid      <= f_shift(s_alias_valid, s_shift);
    r_alias            <= f_shift(s_alias and s_new_final, s_shift);
    r_old              <= s_old;
    r_oldest_candidate <= s_oldest_candidate;

    -- Synchronous clear overrides the update above
    if r_fault_pipe = '1' then
      r_issued           <= (others => '1');
      r_final            <= (others => '1');
      r_alias_valid      <= (others => '0');
      r_alias            <= (others => '0');
      r_old              <= (others => '0');
      r_oldest_candidate <= (others => '0');
    end if;
  end if;
end process;
-- Alias CAM payload registers (no reset).
-- f_shift for matrices shifts along the second dimension, so each matrix is
-- transposed before the shift and transposed back afterwards; the shift is
-- applied when s_shift='1'.
stations_0 : process(clk_i) is
begin
if rising_edge(clk_i) then
r_alias_addr <= f_opa_transpose(f_shift(f_opa_transpose(s_alias_addr), s_shift));
r_alias_mask <= f_opa_transpose(f_shift(f_opa_transpose(s_alias_mask), s_shift));
end if;
end process;
-- Operand station references, with asynchronous reset (all bits '1') and a
-- load enable.
--   s_shift = '1' : stations move down by c_renamers; the top c_renamers rows
--                   are loaded from the renamer, or with all '1' bits while a
--                   fault is being delivered (r_fault_out = '1')
--   otherwise     : the references are replaced by s_stata/s_statb
stations_0rl : process(rst_n_i, clk_i) is
begin
  if rst_n_i = '0' then -- asynchronous clear
    r_stata <= (others => (others => '1'));
    r_statb <= (others => (others => '1'));
  elsif rising_edge(clk_i) then
    if s_shift /= '1' then
      r_stata <= s_stata;
      r_statb <= s_statb;
    else -- load enable
      -- These two are sneaky; they are half lagged. Content lags thanks to s_stat[ab].
      for st in 0 to c_num_stat-1 loop
        for bit in 0 to c_stat_wide-1 loop
          if st < c_num_stat-c_renamers then
            -- Existing stations slide down
            r_stata(st,bit) <= s_stata(st+c_renamers,bit);
            r_statb(st,bit) <= s_statb(st+c_renamers,bit);
          elsif r_fault_out = '1' then
            -- New stations during a fault: all-ones reference
            r_stata(st,bit) <= '1';
            r_statb(st,bit) <= '1';
          else
            -- New stations from the renamer
            r_stata(st,bit) <= rename_stata_i(st-(c_num_stat-c_renamers),bit);
            r_statb(st,bit) <= rename_statb_i(st-(c_num_stat-c_renamers),bit);
          end if;
        end loop;
      end loop;
    end if;
  end if;
end process;
-- Fast/slow class flags of the stations: 0-latency, asynchronous reset to
-- zero, clock enable s_shift.
-- On a shift the flags move down by c_renamers and the top c_renamers entries
-- are taken from the renamer, or zeroed while a fault is being delivered.
stations_0rc : process(rst_n_i, clk_i) is
begin
  if rst_n_i = '0' then
    r_fast <= (others => '0');
    r_slow <= (others => '0');
  elsif rising_edge(clk_i) then
    if s_shift = '1' then
      -- Existing stations slide down
      r_fast(c_num_stat-c_renamers-1 downto 0) <= r_fast(c_num_stat-1 downto c_renamers);
      r_slow(c_num_stat-c_renamers-1 downto 0) <= r_slow(c_num_stat-1 downto c_renamers);

      -- New stations: renamer values by default ...
      r_fast(c_num_stat-1 downto c_num_stat-c_renamers) <= rename_fast_i;
      r_slow(c_num_stat-1 downto c_num_stat-c_renamers) <= rename_slow_i;
      -- ... replaced by zeros on a fault
      if r_fault_out = '1' then
        r_fast(c_num_stat-1 downto c_num_stat-c_renamers) <= (others => '0');
        r_slow(c_num_stat-1 downto c_num_stat-c_renamers) <= (others => '0');
      end if;
    end if;
  end if;
end process;
-- Ready/wipe flags and the five schedule pipeline stages: 1-latency, with
-- asynchronous reset and a synchronous clear on r_fault_pipe.
-- s_wipe replicates r_wipe once per execution unit so that it can mask the
-- schedule matrices.
s_wipe <= f_opa_dup_row(c_executers, r_wipe);
stations_1rs : process(clk_i, rst_n_i) is
begin
  if rst_n_i = '0' then
    r_ready      <= (others => '1');
    r_wipe       <= (others => '0');
    r_schedule0  <= (others => (others => '0'));
    r_schedule1s <= (others => (others => '0'));
    r_schedule2  <= (others => (others => '0'));
    r_schedule3s <= (others => (others => '0'));
    r_schedule4s <= (others => (others => '0'));
  elsif rising_edge(clk_i) then
    if r_fault_pipe /= '1' then
      -- Normal operation
      r_ready <= s_new_ready;
      -- wipe does not need to consider r_alias; r_alias => r_final => not in schedule
      r_wipe <= f_shift(s_nodep or s_retry, s_shift);
      -- Stage 0: freshly selected stations, slow units concatenated with fast units
      r_schedule0 <= f_opa_transpose(f_opa_concat(
        f_opa_transpose(s_schedule_slow and f_opa_dup_row(c_num_slow, s_pending_slow)),
        f_opa_transpose(s_schedule_fast and f_opa_dup_row(c_num_fast, s_pending_fast))));
      -- Stages 1..4: advance one stage per cycle, dropping wiped stations.
      -- Stages 0 and 2 are realigned with r_shift before being moved on.
      r_schedule1s <= f_shift(f_shift(r_schedule0, r_shift) and not s_wipe, s_shift);
      r_schedule2  <= r_schedule1s and not s_wipe;
      r_schedule3s <= f_shift(f_shift(r_schedule2, r_shift) and not s_wipe, s_shift);
      r_schedule4s <= f_shift(r_schedule3s and not s_wipe, s_shift);
    else
      -- Synchronous clear: same state as after reset
      r_ready      <= (others => '1');
      r_wipe       <= (others => '0');
      r_schedule0  <= (others => (others => '0'));
      r_schedule1s <= (others => (others => '0'));
      r_schedule2  <= (others => (others => '0'));
      r_schedule3s <= (others => (others => '0'));
      r_schedule4s <= (others => (others => '0'));
    end if;
  end if;
end process;
-- r_shift is s_shift delayed by one clock cycle; asynchronous reset to '0'.
stations_1r : process(clk_i, rst_n_i) is
begin
if rst_n_i = '0' then
r_shift <= '0';
elsif rising_edge(clk_i) then
r_shift <= s_shift;
end if;
end process;
-- Station registers, 1-latency, without reset.
stations_1 : process(clk_i) is
begin
  if rising_edge(clk_i) then
    -- Retry flags reported by the execution units
    r_retry <= eu_retry_i;
    -- Stations issued in this cycle. These feed s_issued, so all three
    -- reissue sources have to be accounted for:
    --   s_nodep : already covered by s_pending_{fast,slow}
    --   s_retry : impossible (retry => scheduled => issued => not issued now)
    --   r_alias : r_alias => r_final => r_issued => not issued now
    r_slow_issue <= s_pending_slow and s_slow_issue;
    r_fast_issue <= s_pending_fast and s_fast_issue;
  end if;
end process;
-- Backing register of each station's result: 1-latency, asynchronous reset to
-- c_init_bak, clock enable r_shift.
-- On a shift the rows move down by c_renamers and the top c_renamers rows are
-- refilled from the skidpad register r_sp_bakx.
stations_1rc : process(clk_i, rst_n_i) is
begin
  if rst_n_i = '0' then
    r_bakx <= c_init_bak;
  elsif rising_edge(clk_i) then
    if r_shift = '1' then -- clock enable port
      for st in 0 to c_num_stat-1 loop
        for bit in 0 to c_back_wide-1 loop
          if st < c_num_stat-c_renamers then
            r_bakx(st,bit) <= r_bakx(st+c_renamers,bit);
          else
            r_bakx(st,bit) <= r_sp_bakx(st-(c_num_stat-c_renamers),bit);
          end if;
        end loop;
      end loop;
    end if;
  end if;
end process;
-- Register the stations, 1-latency without reset, with clock enable
stations_1c : process(clk_i) is
begin
if rising_edge(clk_i) then
if r_shift = '1' then -- clock enable port
for i in 0 to c_num_stat-c_renamers-1 loop
r_geta(i) <= r_geta(i+c_renamers);
r_getb(i) <= r_getb(i+c_renamers);
for b in 0 to c_aux_wide-1 loop
r_aux (i,b) <= r_aux (i+c_renamers,b);
end loop;
for b in 0 to c_back_wide-1 loop
r_baka(i,b) <= r_baka(i+c_renamers,b);
r_bakb(i,b) <= r_bakb(i+c_renamers,b);
end loop;
end loop;
for i in c_num_stat-c_renamers to c_num_stat-1 loop
r_geta(i) <= r_sp_geta(i-(c_num_stat-c_renamers));
r_getb(i) <= r_sp_getb(i-(c_num_stat-c_renamers));
for b in 0 to c_aux_wide-1 loop
r_aux (i,b) <= r_sp_aux (i-(c_num_stat-c_renamers),b);
end loop;
for b in 0 to c_back_wide-1 loop
r_baka(i,b) <= r_sp_baka(i-(c_num_stat-c_renamers),b);
r_bakb(i,b) <= r_sp_bakb(i-(c_num_stat-c_renamers),b);
end loop;