0001-BFQ-v9-20190204.patch
diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt
index 8d8d8f06cab2..41d0200944f1 100644
--- a/Documentation/block/bfq-iosched.txt
+++ b/Documentation/block/bfq-iosched.txt
@@ -1,3 +1,6 @@
+[ THIS TREE ALSO CONTAINS THE DEV VERSION OF BFQ.
+  DETAILS AT THE END OF THIS DOCUMENT. ]
+
BFQ (Budget Fair Queueing)
==========================
@@ -11,6 +14,15 @@ controllers), BFQ's main features are:
groups (switching back to time distribution when needed to keep
throughput high).
+If bfq-mq patches have been applied, then the following three
+instances of BFQ are available (otherwise only the first instance):
+- bfq: mainline version of BFQ, for blk-mq
+- bfq-mq: development version of BFQ for blk-mq; this version also
+  contains all the latest features and fixes not yet merged in mainline,
+  plus many safety checks
+- bfq-sq: BFQ for legacy blk; this version too contains the latest
+  features and fixes, as well as safety checks
+
In its default configuration, BFQ privileges latency over
throughput. So, when needed for achieving a lower latency, BFQ builds
schedules that may lead to a lower throughput. If your main or only
@@ -22,27 +34,42 @@ latency and throughput, or on how to maximize throughput.
BFQ has a non-null overhead, which limits the maximum IOPS that a CPU
can process for a device scheduled with BFQ. To give an idea of the
-limits on slow or average CPUs, here are, first, the limits of BFQ for
-three different CPUs, on, respectively, an average laptop, an old
-desktop, and a cheap embedded system, in case full hierarchical
-support is enabled (i.e., CONFIG_BFQ_GROUP_IOSCHED is set), but
+limits on slow or average CPUs, here are, first, the limits of bfq-mq
+and bfq for three different CPUs, on, respectively, an average laptop,
+an old desktop, and a cheap embedded system, in case full hierarchical
+support is enabled (i.e., CONFIG_MQ_BFQ_GROUP_IOSCHED is set for
+bfq-mq, or CONFIG_BFQ_GROUP_IOSCHED is set for bfq), but
CONFIG_DEBUG_BLK_CGROUP is not set (Section 4-2):
- Intel i7-4850HQ: 400 KIOPS
- AMD A8-3850: 250 KIOPS
- ARM CortexTM-A53 Octa-core: 80 KIOPS
-If CONFIG_DEBUG_BLK_CGROUP is set (and of course full hierarchical
-support is enabled), then the sustainable throughput with BFQ
-decreases, because all blkio.bfq* statistics are created and updated
-(Section 4-2). For BFQ, this leads to the following maximum
-sustainable throughputs, on the same systems as above:
+As for bfq-sq, it cannot reach the above IOPS, because of the
+inherently lower parallelism of legacy blk and of the components
+within it (including bfq-sq itself). In particular, results with
+CONFIG_DEBUG_BLK_CGROUP unset fluctuate considerably. The limits
+reported below for the case where CONFIG_DEBUG_BLK_CGROUP is set do,
+however, provide a lower bound for the limits of bfq-sq.
+
+Turning back to bfq-mq and bfq, if CONFIG_DEBUG_BLK_CGROUP is set (and
+of course full hierarchical support is enabled), then the sustainable
+throughput with bfq-mq and bfq decreases, because all blkio.bfq*
+statistics are created and updated (Section 4-2). For bfq-mq and bfq,
+this leads to the following maximum sustainable throughputs, on the
+same systems as above:
- Intel i7-4850HQ: 310 KIOPS
- AMD A8-3850: 200 KIOPS
- ARM CortexTM-A53 Octa-core: 56 KIOPS
-BFQ works for multi-queue devices too.
+Finally, if CONFIG_DEBUG_BLK_CGROUP is set (and full hierarchical
+support is enabled), then bfq-sq exhibits the following limits:
+- Intel i7-4850HQ: 250 KIOPS
+- AMD A8-3850: 170 KIOPS
+- ARM CortexTM-A53 Octa-core: 45 KIOPS
-The table of contents follow. Impatients can just jump to Section 3.
+BFQ works for multi-queue devices too (bfq and bfq-mq instances).
+
+The table of contents follows. Impatients can just jump to Section 3.
CONTENTS
@@ -509,25 +536,27 @@ To get proportional sharing of bandwidth with BFQ for a given device,
BFQ must of course be the active scheduler for that device.
Within each group directory, the names of the files associated with
-BFQ-specific cgroup parameters and stats begin with the "bfq."
-prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for
-BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group
-parameter to set the weight of a group with BFQ is blkio.bfq.weight
+BFQ-specific cgroup parameters and stats begin with the "bfq.",
+"bfq-sq." or "bfq-mq." prefix, depending on which instance of bfq you
+want to use. So, with cgroups-v1 or cgroups-v2, the full prefix for
+BFQ-specific files is "blkio.bfqX." or "io.bfqX.", where X can be ""
+(i.e., null string), "-sq" or "-mq". For example, the group parameter
+to set the weight of a group with the mainline BFQ is blkio.bfq.weight
or io.bfq.weight.
As for cgroups-v1 (blkio controller), the exact set of stat files
-created, and kept up-to-date by bfq, depends on whether
-CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq creates all
+created, and kept up-to-date by bfq*, depends on whether
+CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq* creates all
the stat files documented in
Documentation/cgroup-v1/blkio-controller.txt. If, instead,
-CONFIG_DEBUG_BLK_CGROUP is not set, then bfq creates only the files
-blkio.bfq.io_service_bytes
-blkio.bfq.io_service_bytes_recursive
-blkio.bfq.io_serviced
-blkio.bfq.io_serviced_recursive
+CONFIG_DEBUG_BLK_CGROUP is not set, then bfq* creates only the files
+blkio.bfq*.io_service_bytes
+blkio.bfq*.io_service_bytes_recursive
+blkio.bfq*.io_serviced
+blkio.bfq*.io_serviced_recursive
The value of CONFIG_DEBUG_BLK_CGROUP greatly influences the maximum
-throughput sustainable with bfq, because updating the blkio.bfq.*
+throughput sustainable with bfq*, because updating the blkio.bfq*
stats is rather costly, especially for some of the stats enabled by
CONFIG_DEBUG_BLK_CGROUP.
@@ -536,7 +565,7 @@ Parameters to set
For each group, there is only the following parameter to set.
-weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the
+weight (namely blkio.bfqX.weight or io.bfqX.weight): the weight of the
group inside its parent. Available values: 1..10000 (default 100). The
linear mapping between ioprio and weights, described at the beginning
of the tunable section, is still valid, but all weights higher than
@@ -559,3 +588,55 @@ applications. Unset this tunable if you need/want to control weights.
Slightly extended version:
http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite-
results.pdf
+
+----------------------------------------------------------------------
+
+DETAILS ON THE DEV VERSIONS IN THIS TREE
+
+The dev version of BFQ is available for both the legacy and the
+multi-queue block layers, as two additional I/O schedulers, named,
+respectively, bfq-sq-iosched and bfq-mq-iosched (the latter is
+available only if the changes introducing bfq-mq-iosched have also
+been applied). In particular, this tree contains the dev version of
+bfq for Linux mainline 4.19.0, obtained from the dev version for
+Linux 4.18.0. Rebasing from 4.18 to 4.19 required two manual
+interventions.
+
+First, some conflicts had to be resolved, as follows:
+
+---------------------------------------------------------------
+
+diff --cc Makefile
+index 7727c1bf6fa5,69fa5c0310d8..c7cbdf0ad594
+--- a/Makefile
++++ b/Makefile
+@@@ -1,9 -1,9 +1,9 @@@
+ # SPDX-License-Identifier: GPL-2.0
+ VERSION = 4
+- PATCHLEVEL = 18
++ PATCHLEVEL = 19
+ SUBLEVEL = 0
+ -EXTRAVERSION =
+ +EXTRAVERSION = -bfq-mq
+- NAME = Merciless Moray
++ NAME = "People's Front"
+
+ # *DOCUMENTATION*
+ # To see a list of typical targets execute "make help"
+diff --cc include/linux/blkdev.h
+index 897c63322bd7,6980014357d4..8c4568ea6884
+--- a/include/linux/blkdev.h
++++ b/include/linux/blkdev.h
+@@@ -56,7 -54,7 +54,7 @@@ struct blk_stat_callback
+ * Maximum number of blkcg policies allowed to be registered concurrently.
+ * Defined here to simplify include dependency.
+ */
+--#define BLKCG_MAX_POLS 5
+++#define BLKCG_MAX_POLS 7
+
+ typedef void (rq_end_io_fn)(struct request *, blk_status_t);
+
+---------------------------------------------------------------
+
+Second, the following port commit had to be made:
+port commit "block: use ktime_get_ns() instead of sched_clock() for cfq and bfq"
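
For illustration, here is roughly how the cgroup interface described in
the documentation changes above would be used with the dev schedulers.
This is only a sketch: the cgroup mount point and the group name
("mygroup") are placeholders, not taken from this patch, and bfq-mq (or
bfq-sq) must already be the active scheduler for the device:

    mkdir /sys/fs/cgroup/blkio/mygroup
    # weight range is 1..10000, default 100
    echo 300 > /sys/fs/cgroup/blkio/mygroup/blkio.bfq-mq.weight

With bfq-sq the parameter is blkio.bfq-sq.weight, and with mainline bfq
it is blkio.bfq.weight (or io.bfq.weight under cgroups-v2).
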
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index e32fc1f274d8..94cb28eb20ba 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -12,6 +12,11 @@ CONFIG_NO_HZ=y
CONFIG_HIGH_RES_TIMERS=y
CONFIG_LOG_BUF_SHIFT=18
CONFIG_CGROUPS=y
+CONFIG_BLK_CGROUP=y
+CONFIG_IOSCHED_BFQ_SQ=y
+CONFIG_BFQ_SQ_GROUP_IOSCHED=y
+CONFIG_MQ_IOSCHED_BFQ=y
+CONFIG_MQ_BFQ_GROUP_IOSCHED=y
CONFIG_CGROUP_FREEZER=y
CONFIG_CPUSETS=y
CONFIG_CGROUP_CPUACCT=y
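
The defconfig hunk above enables the new options for the x86_64 default
configuration only. On a custom configuration, one would presumably set
the same symbols, for example with the kernel's scripts/config helper
(a sketch, not part of the patch):

    scripts/config --enable BLK_CGROUP \
                   --enable IOSCHED_BFQ_SQ --enable BFQ_SQ_GROUP_IOSCHED \
                   --enable MQ_IOSCHED_BFQ --enable MQ_BFQ_GROUP_IOSCHED
    make olddefconfig
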
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index a4a8914bf7a4..299a6861fb90 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -40,6 +40,26 @@ config CFQ_GROUP_IOSCHED
---help---
Enable group IO scheduling in CFQ.
+config IOSCHED_BFQ_SQ
+ tristate "BFQ-SQ I/O scheduler"
+ default n
+ ---help---
+ The BFQ-SQ I/O scheduler (for legacy blk: SQ stands for
+ SingleQueue) distributes bandwidth among all processes
+ according to their weights, regardless of the device
+ parameters and with any workload. It also guarantees a low
+ latency to interactive and soft real-time applications.
+ Details in Documentation/block/bfq-iosched.txt
+
+config BFQ_SQ_GROUP_IOSCHED
+ bool "BFQ-SQ hierarchical scheduling support"
+ depends on IOSCHED_BFQ_SQ && BLK_CGROUP
+ default n
+ ---help---
+
+ Enable hierarchical scheduling in BFQ-SQ, using the blkio
+ (cgroups-v1) or io (cgroups-v2) controller.
+
choice
prompt "Default I/O scheduler"
@@ -54,6 +74,16 @@ choice
config DEFAULT_CFQ
bool "CFQ" if IOSCHED_CFQ=y
+ config DEFAULT_BFQ_SQ
+ bool "BFQ-SQ" if IOSCHED_BFQ_SQ=y
+ help
+ Selects BFQ-SQ as the default I/O scheduler which will be
+ used by default for all block devices.
+ The BFQ-SQ I/O scheduler aims at distributing the bandwidth
+ as desired, independently of the disk parameters and with
+ any workload. It also tries to guarantee low latency to
+ interactive and soft real-time applications.
+
config DEFAULT_NOOP
bool "No-op"
@@ -63,8 +93,28 @@ config DEFAULT_IOSCHED
string
default "deadline" if DEFAULT_DEADLINE
default "cfq" if DEFAULT_CFQ
+ default "bfq-sq" if DEFAULT_BFQ_SQ
default "noop" if DEFAULT_NOOP
+config MQ_IOSCHED_BFQ
+ tristate "BFQ-MQ I/O Scheduler"
+ default y
+ ---help---
+ BFQ I/O scheduler for BLK-MQ. BFQ-MQ distributes bandwidth
+ among all processes according to their weights, regardless of
+ the device parameters and with any workload. It also
+ guarantees a low latency to interactive and soft real-time
+ applications. Details in Documentation/block/bfq-iosched.txt
+
+config MQ_BFQ_GROUP_IOSCHED
+ bool "BFQ-MQ hierarchical scheduling support"
+ depends on MQ_IOSCHED_BFQ && BLK_CGROUP
+ default n
+ ---help---
+
+ Enable hierarchical scheduling in BFQ-MQ, using the blkio
+ (cgroups-v1) or io (cgroups-v2) controller.
+
config MQ_IOSCHED_DEADLINE
tristate "MQ deadline I/O scheduler"
default y
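
Even if bfq-sq is not chosen as the build-time default above, it can
presumably be selected for legacy-blk devices at boot time via the
elevator= kernel parameter, or per device at runtime through sysfs; the
blk-mq schedulers (bfq, bfq-mq) are selected per device through sysfs
only. The device name below is a placeholder:

    elevator=bfq-sq                                (kernel command line, legacy blk)
    echo bfq-mq > /sys/block/sda/queue/scheduler   (runtime, per device)
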
diff --git a/block/Makefile b/block/Makefile
index 572b33f32c07..1dd6ffdc2fee 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -25,6 +25,8 @@ obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o
obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
obj-$(CONFIG_IOSCHED_BFQ) += bfq.o
+obj-$(CONFIG_IOSCHED_BFQ_SQ) += bfq-sq-iosched.o
+obj-$(CONFIG_MQ_IOSCHED_BFQ) += bfq-mq-iosched.o
obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o
diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c
new file mode 100644
index 000000000000..15459e50cd6a
--- /dev/null
+++ b/block/bfq-cgroup-included.c
@@ -0,0 +1,1359 @@
+/*
+ * BFQ: CGROUPS support.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <[email protected]>
+ *
+ * Copyright (C) 2008 Fabio Checconi <[email protected]>
+ * Paolo Valente <[email protected]>
+ *
+ * Copyright (C) 2015 Paolo Valente <[email protected]>
+ *
+ * Copyright (C) 2016 Paolo Valente <[email protected]>
+ *
+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
+ * file.
+ */
+
+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP)
+
+/* bfqg stats flags */
+enum bfqg_stats_flags {
+ BFQG_stats_waiting = 0,
+ BFQG_stats_idling,
+ BFQG_stats_empty,
+};
+
+#define BFQG_FLAG_FNS(name) \
+static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \
+{ \
+ stats->flags |= (1 << BFQG_stats_##name); \
+} \
+static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \
+{ \
+ stats->flags &= ~(1 << BFQG_stats_##name); \
+} \
+static int bfqg_stats_##name(struct bfqg_stats *stats) \
+{ \
+ return (stats->flags & (1 << BFQG_stats_##name)) != 0; \
+} \
+
+BFQG_FLAG_FNS(waiting)
+BFQG_FLAG_FNS(idling)
+BFQG_FLAG_FNS(empty)
+#undef BFQG_FLAG_FNS
+
+#ifdef BFQ_MQ
+/* This should be called with the scheduler lock held. */
+#else
+/* This should be called with the queue_lock held. */
+#endif
+static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
+{
+ u64 now;
+
+ if (!bfqg_stats_waiting(stats))
+ return;
+
+ now = ktime_get_ns();
+ if (now > stats->start_group_wait_time)
+ blkg_stat_add(&stats->group_wait_time,
+ now - stats->start_group_wait_time);
+ bfqg_stats_clear_waiting(stats);
+}
+
+#ifdef BFQ_MQ
+/* This should be called with the scheduler lock held. */
+#else
+/* This should be called with the queue_lock held. */
+#endif
+static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
+ struct bfq_group *curr_bfqg)
+{
+ struct bfqg_stats *stats = &bfqg->stats;
+
+ if (bfqg_stats_waiting(stats))
+ return;
+ if (bfqg == curr_bfqg)
+ return;
+ stats->start_group_wait_time = ktime_get_ns();
+ bfqg_stats_mark_waiting(stats);
+}
+
+#ifdef BFQ_MQ
+/* This should be called with the scheduler lock held. */
+#else
+/* This should be called with the queue_lock held. */
+#endif
+static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
+{
+ u64 now;
+
+ if (!bfqg_stats_empty(stats))
+ return;
+
+ now = ktime_get_ns();
+ if (now > stats->start_empty_time)
+ blkg_stat_add(&stats->empty_time,
+ now - stats->start_empty_time);
+ bfqg_stats_clear_empty(stats);
+}
+
+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
+{
+ blkg_stat_add(&bfqg->stats.dequeue, 1);
+}
+
+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
+{
+ struct bfqg_stats *stats = &bfqg->stats;
+
+ if (blkg_rwstat_total(&stats->queued))
+ return;
+
+ /*
+ * group is already marked empty. This can happen if bfqq got new
+ * request in parent group and moved to this group while being added
+ * to service tree. Just ignore the event and move on.
+ */
+ if (bfqg_stats_empty(stats))
+ return;
+
+ stats->start_empty_time = ktime_get_ns();
+ bfqg_stats_mark_empty(stats);
+}
+
+static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
+{
+ struct bfqg_stats *stats = &bfqg->stats;
+
+ if (bfqg_stats_idling(stats)) {
+ u64 now = ktime_get_ns();
+
+ if (now > stats->start_idle_time)
+ blkg_stat_add(&stats->idle_time,
+ now - stats->start_idle_time);
+ bfqg_stats_clear_idling(stats);
+ }
+}
+
+static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
+{
+ struct bfqg_stats *stats = &bfqg->stats;
+
+ stats->start_idle_time = ktime_get_ns();
+ bfqg_stats_mark_idling(stats);
+}
+
+static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
+{
+ struct bfqg_stats *stats = &bfqg->stats;
+
+ blkg_stat_add(&stats->avg_queue_size_sum,
+ blkg_rwstat_total(&stats->queued));
+ blkg_stat_add(&stats->avg_queue_size_samples, 1);
+ bfqg_stats_update_group_wait_time(stats);
+}
+
+static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
+ struct bfq_queue *bfqq,
+ unsigned int op)
+{
+ blkg_rwstat_add(&bfqg->stats.queued, op, 1);
+ bfqg_stats_end_empty_time(&bfqg->stats);
+ if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
+ bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
+}
+
+static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op)
+{
+ blkg_rwstat_add(&bfqg->stats.queued, op, -1);
+}
+
+static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
+{
+ blkg_rwstat_add(&bfqg->stats.merged, op, 1);
+}
+
+static void bfqg_stats_update_completion(struct bfq_group *bfqg,
+ u64 start_time_ns,
+ u64 io_start_time_ns,
+ unsigned int op)
+{
+ struct bfqg_stats *stats = &bfqg->stats;
+ u64 now = ktime_get_ns();
+
+ if (now > io_start_time_ns)
+ blkg_rwstat_add(&stats->service_time, op,
+ now - io_start_time_ns);
+ if (io_start_time_ns > start_time_ns)
+ blkg_rwstat_add(&stats->wait_time, op,
+ io_start_time_ns - start_time_ns);
+}
+
+#else /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */
+
+static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg,
+ struct bfq_queue *bfqq, unsigned int op) { }
+static inline void
+bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
+static inline void
+bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
+static inline void bfqg_stats_update_completion(struct bfq_group *bfqg,
+ u64 start_time_ns,
+ u64 io_start_time_ns,
+ unsigned int op) { }
+static inline void
+bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
+ struct bfq_group *curr_bfqg) { }
+static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { }
+static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
+static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
+static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
+static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
+static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
+
+#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */
+
+#ifdef BFQ_GROUP_IOSCHED_ENABLED
+static struct blkcg_policy blkcg_policy_bfq;
+
+/*
+ * blk-cgroup policy-related handlers
+ * The following functions help in converting between blk-cgroup
+ * internal structures and BFQ-specific structures.
+ */
+
+static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
+{
+ return pd ? container_of(pd, struct bfq_group, pd) : NULL;
+}
+
+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
+{
+ return pd_to_blkg(&bfqg->pd);
+}
+
+static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
+{
+ struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq);
+
+ return pd_to_bfqg(pd);
+}
+
+/*
+ * bfq_group handlers
+ * The following functions help in navigating the bfq_group hierarchy
+ * by allowing to find the parent of a bfq_group or the bfq_group
+ * associated to a bfq_queue.
+ */
+
+static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
+{
+ struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;
+
+ return pblkg ? blkg_to_bfqg(pblkg) : NULL;
+}
+
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
+{
+ struct bfq_entity *group_entity = bfqq->entity.parent;
+
+ return group_entity ? container_of(group_entity, struct bfq_group,
+ entity) :
+ bfqq->bfqd->root_group;
+}
+
+/*
+ * The following two functions handle get and put of a bfq_group by
+ * wrapping the related blk-cgroup hooks.
+ */
+
+static void bfqg_get(struct bfq_group *bfqg)
+{
+#ifdef BFQ_MQ
+ bfqg->ref++;
+#else
+ blkg_get(bfqg_to_blkg(bfqg));
+#endif
+}
+
+static void bfqg_put(struct bfq_group *bfqg)
+{
+#ifdef BFQ_MQ
+ bfqg->ref--;
+
+ BUG_ON(bfqg->ref < 0);
+ if (bfqg->ref == 0)
+ kfree(bfqg);
+#else
+ blkg_put(bfqg_to_blkg(bfqg));
+#endif
+}
+
+#ifdef BFQ_MQ
+static void bfqg_and_blkg_get(struct bfq_group *bfqg)
+{
+ /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */
+ bfqg_get(bfqg);
+
+ blkg_get(bfqg_to_blkg(bfqg));
+}
+
+static void bfqg_and_blkg_put(struct bfq_group *bfqg)
+{
+ blkg_put(bfqg_to_blkg(bfqg));
+
+ bfqg_put(bfqg);
+}
+#endif
+
+/* @stats = 0 */
+static void bfqg_stats_reset(struct bfqg_stats *stats)
+{
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ /* queued stats shouldn't be cleared */
+ blkg_rwstat_reset(&stats->merged);
+ blkg_rwstat_reset(&stats->service_time);
+ blkg_rwstat_reset(&stats->wait_time);
+ blkg_stat_reset(&stats->time);
+ blkg_stat_reset(&stats->avg_queue_size_sum);
+ blkg_stat_reset(&stats->avg_queue_size_samples);
+ blkg_stat_reset(&stats->dequeue);
+ blkg_stat_reset(&stats->group_wait_time);
+ blkg_stat_reset(&stats->idle_time);
+ blkg_stat_reset(&stats->empty_time);
+#endif
+}
+
+/* @to += @from */
+static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
+{
+ if (!to || !from)
+ return;
+
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ /* queued stats shouldn't be cleared */
+ blkg_rwstat_add_aux(&to->merged, &from->merged);
+ blkg_rwstat_add_aux(&to->service_time, &from->service_time);
+ blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
+	blkg_stat_add_aux(&to->time, &from->time);
+ blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+ blkg_stat_add_aux(&to->avg_queue_size_samples,
+ &from->avg_queue_size_samples);
+ blkg_stat_add_aux(&to->dequeue, &from->dequeue);
+ blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
+ blkg_stat_add_aux(&to->idle_time, &from->idle_time);
+ blkg_stat_add_aux(&to->empty_time, &from->empty_time);
+#endif
+}
+
+/*
+ * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors'
+ * recursive stats can still account for the amount used by this bfqg after
+ * it's gone.
+ */
+static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
+{
+ struct bfq_group *parent;
+
+ if (!bfqg) /* root_group */
+ return;
+
+ parent = bfqg_parent(bfqg);
+
+ lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);
+
+ if (unlikely(!parent))
+ return;
+
+ bfqg_stats_add_aux(&parent->stats, &bfqg->stats);
+ bfqg_stats_reset(&bfqg->stats);
+}
+
+static void bfq_init_entity(struct bfq_entity *entity,
+ struct bfq_group *bfqg)
+{
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+ entity->weight = entity->new_weight;
+ entity->orig_weight = entity->new_weight;
+ if (bfqq) {
+ bfqq->ioprio = bfqq->new_ioprio;
+ bfqq->ioprio_class = bfqq->new_ioprio_class;
+#ifdef BFQ_MQ
+ /*
+ * Make sure that bfqg and its associated blkg do not
+ * disappear before entity.
+ */
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "getting bfqg %p and blkg\n",
+ bfqg);
+
+ bfqg_and_blkg_get(bfqg);
+#else
+ bfqg_get(bfqg);
+#endif
+ }
+ entity->parent = bfqg->my_entity; /* NULL for root group */
+ entity->sched_data = &bfqg->sched_data;
+}
+
+static void bfqg_stats_exit(struct bfqg_stats *stats)
+{
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ blkg_rwstat_exit(&stats->merged);
+ blkg_rwstat_exit(&stats->service_time);
+ blkg_rwstat_exit(&stats->wait_time);
+ blkg_rwstat_exit(&stats->queued);
+ blkg_stat_exit(&stats->time);
+ blkg_stat_exit(&stats->avg_queue_size_sum);
+ blkg_stat_exit(&stats->avg_queue_size_samples);
+ blkg_stat_exit(&stats->dequeue);
+ blkg_stat_exit(&stats->group_wait_time);
+ blkg_stat_exit(&stats->idle_time);
+ blkg_stat_exit(&stats->empty_time);
+#endif
+}
+
+static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
+{
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ if (blkg_rwstat_init(&stats->merged, gfp) ||
+ blkg_rwstat_init(&stats->service_time, gfp) ||
+ blkg_rwstat_init(&stats->wait_time, gfp) ||
+ blkg_rwstat_init(&stats->queued, gfp) ||
+ blkg_stat_init(&stats->time, gfp) ||
+ blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
+ blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
+ blkg_stat_init(&stats->dequeue, gfp) ||
+ blkg_stat_init(&stats->group_wait_time, gfp) ||
+ blkg_stat_init(&stats->idle_time, gfp) ||
+ blkg_stat_init(&stats->empty_time, gfp)) {
+ bfqg_stats_exit(stats);
+ return -ENOMEM;
+ }
+#endif
+
+ return 0;
+}
+
+static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
+{
+ return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
+}
+
+static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
+{
+ return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
+}
+
+static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
+{
+ struct bfq_group_data *bgd;
+
+ bgd = kzalloc(sizeof(*bgd), gfp);
+ if (!bgd)
+ return NULL;
+ return &bgd->pd;
+}
+
+static void bfq_cpd_init(struct blkcg_policy_data *cpd)
+{
+ struct bfq_group_data *d = cpd_to_bfqgd(cpd);
+
+ d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
+ CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL;
+}
+
+static void bfq_cpd_free(struct blkcg_policy_data *cpd)
+{
+ kfree(cpd_to_bfqgd(cpd));
+}
+
+static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
+{
+ struct bfq_group *bfqg;
+
+ bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
+ if (!bfqg)
+ return NULL;
+
+ if (bfqg_stats_init(&bfqg->stats, gfp)) {
+ kfree(bfqg);
+ return NULL;
+ }
+#ifdef BFQ_MQ
+ /* see comments in bfq_bic_update_cgroup for why refcounting */
+ bfqg_get(bfqg);
+#endif
+ return &bfqg->pd;
+}
+
+static void bfq_pd_init(struct blkg_policy_data *pd)
+{
+ struct blkcg_gq *blkg;
+ struct bfq_group *bfqg;
+ struct bfq_data *bfqd;
+ struct bfq_entity *entity;
+ struct bfq_group_data *d;
+
+ blkg = pd_to_blkg(pd);
+ BUG_ON(!blkg);
+ bfqg = blkg_to_bfqg(blkg);
+ bfqd = blkg->q->elevator->elevator_data;
+ BUG_ON(bfqg == bfqd->root_group);
+ entity = &bfqg->entity;
+ d = blkcg_to_bfqgd(blkg->blkcg);
+
+ entity->orig_weight = entity->weight = entity->new_weight = d->weight;
+ entity->my_sched_data = &bfqg->sched_data;
+ bfqg->my_entity = entity; /*
+ * the root_group's will be set to NULL
+ * in bfq_init_queue()
+ */
+ bfqg->bfqd = bfqd;
+ bfqg->active_entities = 0;
+ bfqg->rq_pos_tree = RB_ROOT;
+}
+
+static void bfq_pd_free(struct blkg_policy_data *pd)
+{
+ struct bfq_group *bfqg = pd_to_bfqg(pd);
+
+ bfqg_stats_exit(&bfqg->stats);
+#ifdef BFQ_MQ
+ bfqg_put(bfqg);
+#else
+ kfree(bfqg);
+#endif
+}
+
+static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
+{
+ struct bfq_group *bfqg = pd_to_bfqg(pd);
+
+ bfqg_stats_reset(&bfqg->stats);
+}
+
+static void bfq_group_set_parent(struct bfq_group *bfqg,
+ struct bfq_group *parent)
+{
+ struct bfq_entity *entity;
+
+ BUG_ON(!parent);
+ BUG_ON(!bfqg);
+ BUG_ON(bfqg == parent);
+
+ entity = &bfqg->entity;
+ entity->parent = parent->my_entity;
+ entity->sched_data = &parent->sched_data;
+}
+
+static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd,
+ struct blkcg *blkcg)
+{
+ struct blkcg_gq *blkg;
+
+ blkg = blkg_lookup(blkcg, bfqd->queue);
+ if (likely(blkg))
+ return blkg_to_bfqg(blkg);
+ return NULL;
+}
+
+static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
+ struct blkcg *blkcg)
+{
+ struct bfq_group *bfqg, *parent;
+ struct bfq_entity *entity;
+
+ bfqg = bfq_lookup_bfqg(bfqd, blkcg);
+
+ if (unlikely(!bfqg))
+ return NULL;
+
+ /*
+ * Update chain of bfq_groups as we might be handling a leaf group
+ * which, along with some of its relatives, has not been hooked yet
+ * to the private hierarchy of BFQ.
+ */
+ entity = &bfqg->entity;
+ for_each_entity(entity) {
+ bfqg = container_of(entity, struct bfq_group, entity);
+ BUG_ON(!bfqg);
+ if (bfqg != bfqd->root_group) {
+ parent = bfqg_parent(bfqg);
+ if (!parent)
+ parent = bfqd->root_group;
+ BUG_ON(!parent);
+ bfq_group_set_parent(bfqg, parent);
+ }
+ }
+
+ return bfqg;
+}
+
+static void bfq_pos_tree_add_move(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq);
+
+static void bfq_bfqq_expire(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq,
+ bool compensate,
+ enum bfqq_expiration reason);
+
+/**
+ * bfq_bfqq_move - migrate @bfqq to @bfqg.
+ * @bfqd: queue descriptor.
+ * @bfqq: the queue to move.
+ * @bfqg: the group to move to.
+ *
+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
+ * it on the new one. Avoid putting the entity on the old group idle tree.
+ *
+#ifdef BFQ_MQ
+ * Must be called under the scheduler lock, to make sure that the blkg
+ * owning @bfqg does not disappear (see comments in
+ * bfq_bic_update_cgroup on guaranteeing the consistency of blkg
+ * objects).
+#else
+ * Must be called under the queue lock; the cgroup owning @bfqg must
+ * not disappear (by now this just means that we are called under
+ * rcu_read_lock()).
+#endif
+ */
+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ struct bfq_group *bfqg)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+
+ BUG_ON(!bfq_bfqq_busy(bfqq) && !RB_EMPTY_ROOT(&bfqq->sort_list));
+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list) && !entity->on_st);
+ BUG_ON(bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list)
+ && entity->on_st &&
+ bfqq != bfqd->in_service_queue);
+ BUG_ON(!bfq_bfqq_busy(bfqq) && bfqq == bfqd->in_service_queue);
+
+ /* If bfqq is empty, then bfq_bfqq_expire also invokes
+ * bfq_del_bfqq_busy, thereby removing bfqq and its entity
+ * from data structures related to current group. Otherwise we
+ * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
+ * we do below.
+ */
+ if (bfqq == bfqd->in_service_queue)
+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
+ false, BFQ_BFQQ_PREEMPTED);
+
+ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq)
+ && &bfq_entity_service_tree(entity)->idle !=
+ entity->tree);
+
+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq));
+
+ if (bfq_bfqq_busy(bfqq))
+ bfq_deactivate_bfqq(bfqd, bfqq, false, false);
+ else if (entity->on_st) {
+ BUG_ON(&bfq_entity_service_tree(entity)->idle !=
+ entity->tree);
+ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
+ }
+#ifdef BFQ_MQ
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "putting blkg and bfqg %p\n", bfqg);
+
+ bfqg_and_blkg_put(bfqq_group(bfqq));
+#else
+ bfqg_put(bfqq_group(bfqq));
+#endif
+
+ entity->parent = bfqg->my_entity;
+ entity->sched_data = &bfqg->sched_data;
+#ifdef BFQ_MQ
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "getting blkg and bfqg %p\n", bfqg);
+
+ /* pin down bfqg and its associated blkg */
+ bfqg_and_blkg_get(bfqg);
+#else
+ bfqg_get(bfqg);
+#endif
+
+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq));
+ if (bfq_bfqq_busy(bfqq)) {
+ bfq_pos_tree_add_move(bfqd, bfqq);
+ bfq_activate_bfqq(bfqd, bfqq);
+ }
+
+ if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
+ bfq_schedule_dispatch(bfqd);
+ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq)
+ && &bfq_entity_service_tree(entity)->idle !=
+ entity->tree);
+}
+
+/**
+ * __bfq_bic_change_cgroup - move @bic to @cgroup.
+ * @bfqd: the queue descriptor.
+ * @bic: the bic to move.
+ * @blkcg: the blk-cgroup to move to.
+ *
+#ifdef BFQ_MQ
+ * Move bic to blkcg, assuming that bfqd->lock is held; which makes
+ * sure that the reference to cgroup is valid across the call (see
+ * comments in bfq_bic_update_cgroup on this issue)
+#else
+ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
+ * has to make sure that the reference to cgroup is valid across the call.