From 888581c83de3ae7aae40e23a8cc9e4bd331d9803 Mon Sep 17 00:00:00 2001
From: HunDunDM
Date: Wed, 28 Jul 2021 16:16:10 +0800
Subject: [PATCH] scheduler: balance-hot-region-scheduler supports TiFlash (#3900)

---
 server/schedulers/hot_region.go         | 274 +++++++++---------------
 server/schedulers/hot_region_config.go  |  22 +-
 server/schedulers/hot_region_test.go    |  72 +++----
 server/schedulers/shuffle_hot_region.go |   8 +-
 server/schedulers/utils.go              | 206 ++++++++++++++----
 tests/pdctl/scheduler/scheduler_test.go |   1 +
 6 files changed, 328 insertions(+), 255 deletions(-)

diff --git a/server/schedulers/hot_region.go b/server/schedulers/hot_region.go
index 7129676d578e..7bffd0748cbf 100644
--- a/server/schedulers/hot_region.go
+++ b/server/schedulers/hot_region.go
@@ -77,17 +77,18 @@ type hotScheduler struct {
 	types []rwType
 	r     *rand.Rand
 
-	// states across multiple `Schedule` calls
-	pendings map[*pendingInfluence]struct{}
-	// regionPendings stores regionID -> Operator
+	// regionPendings stores regionID -> pendingInfluence
 	// this records regionID which have pending Operator by operation type. During filterHotPeers, the hot peers won't
 	// be selected if its owner region is tracked in this attribute.
-	regionPendings map[uint64]*operator.Operator
+	regionPendings map[uint64]*pendingInfluence
+	// store information, including pending Influence by resource type
+	// It is recalculated every time Schedule is called.
+	stInfos map[uint64]*storeInfo
 	// temporary states but exported to API or metrics
+	// It is recalculated every time Schedule is called.
 	stLoadInfos [resourceTypeLen]map[uint64]*storeLoadDetail
-	// This stores the pending Influence for each store by resource type.
-	pendingSums map[uint64]*Influence
+	// config of hot scheduler
 	conf *hotRegionSchedulerConfig
 }
@@ -99,8 +100,7 @@ func newHotScheduler(opController *schedule.OperatorController, conf *hotRegionS
 		BaseScheduler:  base,
 		types:          []rwType{write, read},
 		r:              rand.New(rand.NewSource(time.Now().UnixNano())),
-		pendings:       map[*pendingInfluence]struct{}{},
-		regionPendings: make(map[uint64]*operator.Operator),
+		regionPendings: make(map[uint64]*pendingInfluence),
 		conf:           conf,
 	}
 	for ty := resourceType(0); ty < resourceTypeLen; ty++ {
@@ -159,23 +159,20 @@ func (h *hotScheduler) dispatch(typ rwType, cluster opt.Cluster) []*operator.Ope
 // prepareForBalance calculate the summary of pending Influence for each store and prepare the load detail for
 // each store
 func (h *hotScheduler) prepareForBalance(cluster opt.Cluster) {
+	h.stInfos = summaryStoreInfos(cluster)
 	h.summaryPendingInfluence()
-
-	stores := cluster.GetStores()
 	storesLoads := cluster.GetStoresLoads()
 
 	{ // update read statistics
 		regionRead := cluster.RegionReadStats()
 		h.stLoadInfos[readLeader] = summaryStoresLoad(
-			stores,
+			h.stInfos,
 			storesLoads,
-			h.pendingSums,
 			regionRead,
 			read, core.LeaderKind)
 		h.stLoadInfos[readPeer] = summaryStoresLoad(
-			stores,
+			h.stInfos,
 			storesLoads,
-			h.pendingSums,
 			regionRead,
 			read, core.RegionKind)
 	}
@@ -183,16 +180,13 @@ func (h *hotScheduler) prepareForBalance(cluster opt.Cluster) {
 	{ // update write statistics
 		regionWrite := cluster.RegionWriteStats()
 		h.stLoadInfos[writeLeader] = summaryStoresLoad(
-			stores,
+			h.stInfos,
 			storesLoads,
-			h.pendingSums,
 			regionWrite,
 			write, core.LeaderKind)
-
 		h.stLoadInfos[writePeer] = summaryStoresLoad(
-			stores,
+			h.stInfos,
 			storesLoads,
-			h.pendingSums,
 			regionWrite,
 			write, core.RegionKind)
 	}
 }
 
@@ -200,143 +194,37 @@
 // summaryPendingInfluence calculate the summary of pending Influence for each store
 // and clean the region from regionInfluence if they have ended operator.
+// It makes each key/byte rate or count become (1+w) times the original value, where w is the weight calculated by calcPendingInfluence.
 func (h *hotScheduler) summaryPendingInfluence() {
-	h.pendingSums = summaryPendingInfluence(h.pendings, h.calcPendingWeight)
-	h.gcRegionPendings()
-}
-
-// gcRegionPendings check the region whether it need to be deleted from regionPendings depended on whether it have
-// ended operator
-func (h *hotScheduler) gcRegionPendings() {
-	for regionID, op := range h.regionPendings {
-		if op != nil && op.IsEnd() {
-			if time.Now().After(op.GetCreateTime().Add(h.conf.GetMaxZombieDuration())) {
-				log.Debug("gc pending influence in hot region scheduler", zap.Uint64("region-id", regionID), zap.Time("create", op.GetCreateTime()), zap.Time("now", time.Now()), zap.Duration("zombie", h.conf.GetMaxZombieDuration()))
-				schedulerStatus.WithLabelValues(h.GetName(), "pending_op_infos").Dec()
-				delete(h.regionPendings, regionID)
-			}
+	storeZombieDur := h.conf.GetStoreStatZombieDuration()
+	regionsZombieDur := h.conf.GetRegionsStatZombieDuration()
+
+	for id, p := range h.regionPendings {
+		from := h.stInfos[p.from]
+		to := h.stInfos[p.to]
+		maxZombieDur := storeZombieDur
+		if (from != nil && from.IsTiFlash) || (to != nil && to.IsTiFlash) {
+			maxZombieDur = regionsZombieDur
 		}
-	}
-}
-
-// summaryStoresLoad Load information of all available stores.
-// it will filtered the hot peer and calculate the current and future stat(byte/key rate,count) for each store
-func summaryStoresLoad(
-	stores []*core.StoreInfo,
-	storesLoads map[uint64][]float64,
-	storePendings map[uint64]*Influence,
-	storeHotPeers map[uint64][]*statistics.HotPeerStat,
-	rwTy rwType,
-	kind core.ResourceKind,
-) map[uint64]*storeLoadDetail {
-	// loadDetail stores the storeID -> hotPeers stat and its current and future stat(key/byte rate,count)
-	loadDetail := make(map[uint64]*storeLoadDetail, len(storesLoads))
-	allLoadSum := make([]float64, statistics.DimLen)
-	allCount := 0.0
-
-	// Stores without byte rate statistics is not available to schedule.
-	for _, store := range stores {
-		id := store.GetID()
-		storeLoads, ok := storesLoads[id]
-		if !ok {
+		weight, needGC := h.calcPendingInfluence(p.op, maxZombieDur)
+
+		if needGC {
+			delete(h.regionPendings, id)
+			log.Debug("gc pending influence in hot region scheduler",
+				zap.Uint64("region-id", id),
+				zap.Time("create", p.op.GetCreateTime()),
+				zap.Time("now", time.Now()),
+				zap.Duration("zombie", maxZombieDur))
 			continue
 		}
-		loads := make([]float64, statistics.DimLen)
-		switch rwTy {
-		case read:
-			loads[statistics.ByteDim] = storeLoads[statistics.StoreReadBytes]
-			loads[statistics.KeyDim] = storeLoads[statistics.StoreReadKeys]
-		case write:
-			loads[statistics.ByteDim] = storeLoads[statistics.StoreWriteBytes]
-			loads[statistics.KeyDim] = storeLoads[statistics.StoreWriteKeys]
-		}
-		// Find all hot peers first
-		var hotPeers []*statistics.HotPeerStat
-		{
-			peerLoadSum := make([]float64, statistics.DimLen)
-			// TODO: To remove `filterHotPeers`, we need to:
-			// HotLeaders consider `Write{Bytes,Keys}`, so when we schedule `writeLeader`, all peers are leader.
-			for _, peer := range filterHotPeers(kind, storeHotPeers[id]) {
-				for i := range peerLoadSum {
-					peerLoadSum[i] += peer.GetLoad(getRegionStatKind(rwTy, i))
-				}
-				hotPeers = append(hotPeers, peer.Clone())
-			}
-			// Use sum of hot peers to estimate leader-only byte rate.
- // For write requests, Write{Bytes, Keys} is applied to all Peers at the same time, while the Leader and Follower are under different loads (usually the Leader consumes more CPU). - // But none of the current dimension reflect this difference, so we create a new dimension to reflect it. - if kind == core.LeaderKind && rwTy == write { - loads = peerLoadSum - } - // Metric for debug. - { - ty := "byte-rate-" + rwTy.String() + "-" + kind.String() - hotPeerSummary.WithLabelValues(ty, fmt.Sprintf("%v", id)).Set(peerLoadSum[statistics.ByteDim]) - } - { - ty := "key-rate-" + rwTy.String() + "-" + kind.String() - hotPeerSummary.WithLabelValues(ty, fmt.Sprintf("%v", id)).Set(peerLoadSum[statistics.KeyDim]) - } - } - for i := range allLoadSum { - allLoadSum[i] += loads[i] - } - allCount += float64(len(hotPeers)) - - // Build store load prediction from current load and pending influence. - stLoadPred := (&storeLoad{ - Loads: loads, - Count: float64(len(hotPeers)), - }).ToLoadPred(rwTy, storePendings[id]) - - // Construct store load info. - loadDetail[id] = &storeLoadDetail{ - Store: store, - LoadPred: stLoadPred, - HotPeers: hotPeers, - } - } - storeLen := float64(len(storesLoads)) - // store expectation byte/key rate and count for each store-load detail. - for id, detail := range loadDetail { - expectLoads := make([]float64, len(allLoadSum)) - for i := range expectLoads { - expectLoads[i] = allLoadSum[i] / storeLen + if from != nil && weight > 0 { + from.addInfluence(&p.origin, -weight) } - expectCount := allCount / storeLen - detail.LoadPred.Expect.Loads = expectLoads - detail.LoadPred.Expect.Count = expectCount - // Debug - { - ty := "exp-byte-rate-" + rwTy.String() + "-" + kind.String() - hotPeerSummary.WithLabelValues(ty, fmt.Sprintf("%v", id)).Set(expectLoads[statistics.ByteDim]) - } - { - ty := "exp-key-rate-" + rwTy.String() + "-" + kind.String() - hotPeerSummary.WithLabelValues(ty, fmt.Sprintf("%v", id)).Set(expectLoads[statistics.KeyDim]) - } - { - ty := "exp-count-rate-" + rwTy.String() + "-" + kind.String() - hotPeerSummary.WithLabelValues(ty, fmt.Sprintf("%v", id)).Set(expectCount) - } - } - return loadDetail -} - -// filterHotPeers filter the peer whose hot degree is less than minHotDegress -func filterHotPeers( - kind core.ResourceKind, - peers []*statistics.HotPeerStat, -) []*statistics.HotPeerStat { - ret := make([]*statistics.HotPeerStat, 0, len(peers)) - for _, peer := range peers { - if kind == core.LeaderKind && !peer.IsLeader() { - continue + if to != nil && weight > 0 { + to.addInfluence(&p.origin, weight) } - ret = append(ret, peer) } - return ret } func (h *hotScheduler) addPendingInfluence(op *operator.Operator, srcStore, dstStore uint64, infl Influence) bool { @@ -348,8 +236,7 @@ func (h *hotScheduler) addPendingInfluence(op *operator.Operator, srcStore, dstS } influence := newPendingInfluence(op, srcStore, dstStore, infl) - h.pendings[influence] = struct{}{} - h.regionPendings[regionID] = op + h.regionPendings[regionID] = influence schedulerStatus.WithLabelValues(h.GetName(), "pending_op_infos").Inc() return true @@ -538,13 +425,26 @@ func (bs *balanceSolver) solve() []*operator.Operator { // its expectation * ratio, the store would be selected as hot source store func (bs *balanceSolver) filterSrcStores() map[uint64]*storeLoadDetail { ret := make(map[uint64]*storeLoadDetail) + confSrcToleranceRatio := bs.sche.conf.GetSrcToleranceRatio() + confEnableForTiFlash := bs.sche.conf.GetEnableForTiFlash() for id, detail := range bs.stLoadDetail { + srcToleranceRatio := 
confSrcToleranceRatio
+		if detail.Info.IsTiFlash {
+			if !confEnableForTiFlash {
+				continue
+			}
+			if bs.rwTy != write || bs.opTy != movePeer {
+				continue
+			}
+			srcToleranceRatio = 0
+		}
 		if len(detail.HotPeers) == 0 {
 			continue
 		}
+
 		minLoad := detail.LoadPred.min()
 		if slice.AllOf(minLoad.Loads, func(i int) bool {
-			return minLoad.Loads[i] > bs.sche.conf.GetSrcToleranceRatio()*detail.LoadPred.Expect.Loads[i]
+			return minLoad.Loads[i] > srcToleranceRatio*detail.LoadPred.Expect.Loads[i]
 		}) {
 			ret[id] = detail
 			hotSchedulerResultCounter.WithLabelValues("src-store-succ", strconv.FormatUint(id, 10)).Inc()
@@ -629,10 +529,11 @@ func (bs *balanceSolver) isRegionAvailable(region *core.RegionInfo) bool {
 		return false
 	}
 
-	if op, ok := bs.sche.regionPendings[region.GetID()]; ok {
+	if influence, ok := bs.sche.regionPendings[region.GetID()]; ok {
 		if bs.opTy == transferLeader {
 			return false
 		}
+		op := influence.op
 		if op.Kind()&operator.OpRegion != 0 ||
 			(op.Kind()&operator.OpLeader != 0 && !op.IsEnd()) {
 			return false
@@ -684,7 +585,7 @@ func (bs *balanceSolver) filterDstStores() map[uint64]*storeLoadDetail {
 		filters    []filter.Filter
 		candidates []*storeLoadDetail
 	)
-	srcStore := bs.stLoadDetail[bs.cur.srcStoreID].Store
+	srcStore := bs.stLoadDetail[bs.cur.srcStoreID].Info.Store
 	switch bs.opTy {
 	case movePeer:
 		filters = []filter.Filter{
@@ -721,9 +622,20 @@ func (bs *balanceSolver) filterDstStores() map[uint64]*storeLoadDetail {
 
 func (bs *balanceSolver) pickDstStores(filters []filter.Filter, candidates []*storeLoadDetail) map[uint64]*storeLoadDetail {
 	ret := make(map[uint64]*storeLoadDetail, len(candidates))
-	dstToleranceRatio := bs.sche.conf.GetDstToleranceRatio()
+	confDstToleranceRatio := bs.sche.conf.GetDstToleranceRatio()
+	confEnableForTiFlash := bs.sche.conf.GetEnableForTiFlash()
 	for _, detail := range candidates {
-		store := detail.Store
+		store := detail.Info.Store
+		dstToleranceRatio := confDstToleranceRatio
+		if detail.Info.IsTiFlash {
+			if !confEnableForTiFlash {
+				continue
+			}
+			if bs.rwTy != write || bs.opTy != movePeer {
+				continue
+			}
+			dstToleranceRatio = 0
+		}
 		if filter.Target(bs.cluster.GetOpts(), store, filters) {
 			maxLoads := detail.LoadPred.max().Loads
 			if slice.AllOf(maxLoads, func(i int) bool {
@@ -1057,40 +969,52 @@ func (h *hotScheduler) GetHotStatus(typ string) *statistics.StoreHotPeersInfos {
 func (h *hotScheduler) GetPendingInfluence() map[uint64]*Influence {
 	h.RLock()
 	defer h.RUnlock()
-	ret := make(map[uint64]*Influence, len(h.pendingSums))
-	for id, infl := range h.pendingSums {
-		ret[id] = infl.add(infl, 0) // copy
+	ret := make(map[uint64]*Influence, len(h.stInfos))
+	for id, info := range h.stInfos {
+		if info.PendingSum != nil {
+			ret[id] = info.PendingSum
+		}
 	}
 	return ret
 }
 
-// calcPendingWeight return the calculate weight of one Operator, the value will between [0,1]
-func (h *hotScheduler) calcPendingWeight(op *operator.Operator) float64 {
-	if op.CheckExpired() || op.CheckTimeout() {
-		return 0
-	}
+// calcPendingInfluence returns the calculated weight of one Operator; the value will be in [0,1].
+func (h *hotScheduler) calcPendingInfluence(op *operator.Operator, maxZombieDur time.Duration) (weight float64, needGC bool) {
 	status := op.Status()
+	switch {
+	case op.CheckExpired():
+		status = operator.EXPIRED
+	case op.CheckTimeout():
+		status = operator.TIMEOUT
+	}
+
 	if !operator.IsEndStatus(status) {
-		return 1
+		return 1, false
+	}
+
+	zombieDur := time.Since(op.GetReachTimeOf(status))
+	if zombieDur >= maxZombieDur {
+		weight = 0
+	} else {
+		// TODO: use store statistics update time to make a more accurate estimation
+		// weight = float64(maxZombieDur-zombieDur) / float64(maxZombieDur)
+		weight = 1
 	}
+
 	switch status {
 	case operator.SUCCESS:
-		zombieDur := time.Since(op.GetReachTimeOf(status))
-		maxZombieDur := h.conf.GetMaxZombieDuration()
-		if zombieDur >= maxZombieDur {
-			return 0
-		}
-		// TODO: use store statistics update time to make a more accurate estimation
-		return float64(maxZombieDur-zombieDur) / float64(maxZombieDur)
+		return weight, weight == 0
+	case operator.CANCELED, operator.REPLACED, operator.TIMEOUT:
+		return 0, weight == 0
+	case operator.EXPIRED:
+		fallthrough
 	default:
-		return 0
+		return 0, true
 	}
 }
 
 func (h *hotScheduler) clearPendingInfluence() {
-	h.pendings = map[*pendingInfluence]struct{}{}
-	h.pendingSums = nil
-	h.regionPendings = make(map[uint64]*operator.Operator)
+	h.regionPendings = make(map[uint64]*pendingInfluence)
 }
 
 // rwType : the perspective of balance
diff --git a/server/schedulers/hot_region_config.go b/server/schedulers/hot_region_config.go
index afaa261cce2a..17bf1af3e525 100644
--- a/server/schedulers/hot_region_config.go
+++ b/server/schedulers/hot_region_config.go
@@ -36,14 +36,15 @@ func initHotRegionScheduleConfig() *hotRegionSchedulerConfig {
 		MinHotByteRate:        100,
 		MinHotKeyRate:         10,
 		MaxZombieRounds:       3,
+		MaxPeerNum:            1000,
 		ByteRateRankStepRatio: 0.05,
 		KeyRateRankStepRatio:  0.05,
 		CountRankStepRatio:    0.01,
 		GreatDecRatio:         0.95,
 		MinorDecRatio:         0.99,
-		MaxPeerNum:            1000,
 		SrcToleranceRatio:     1.05, // Tolerate 5% difference
 		DstToleranceRatio:     1.05, // Tolerate 5% difference
+		EnableForTiFlash:      true,
 	}
 }
@@ -65,6 +66,9 @@ type hotRegionSchedulerConfig struct {
 	MinorDecRatio     float64 `json:"minor-dec-ratio"`
 	SrcToleranceRatio float64 `json:"src-tolerance-ratio"`
 	DstToleranceRatio float64 `json:"dst-tolerance-ratio"`
+
+	// Separately control whether to enable hot region scheduling for TiFlash
+	EnableForTiFlash bool `json:"enable-for-tiflash"`
 }
 
 func (conf *hotRegionSchedulerConfig) EncodeConfig() ([]byte, error) {
@@ -73,12 +77,19 @@ func (conf *hotRegionSchedulerConfig) EncodeConfig() ([]byte, error) {
 	return schedule.EncodeConfig(conf)
 }
 
-func (conf *hotRegionSchedulerConfig) GetMaxZombieDuration() time.Duration {
+func (conf *hotRegionSchedulerConfig) GetStoreStatZombieDuration() time.Duration {
 	conf.RLock()
 	defer conf.RUnlock()
 	return time.Duration(conf.MaxZombieRounds) * statistics.StoreHeartBeatReportInterval * time.Second
 }
 
+func (conf *hotRegionSchedulerConfig) GetRegionsStatZombieDuration() time.Duration {
+	conf.RLock()
+	defer conf.RUnlock()
+	// Since RegionsStatsRollingWindowsSize is very large, multiply by 2.
+ return time.Duration(conf.MaxZombieRounds) * 2 * statistics.RegionHeartBeatReportInterval * time.Second +} + func (conf *hotRegionSchedulerConfig) GetMaxPeerNumber() int { conf.RLock() defer conf.RUnlock() @@ -151,6 +162,12 @@ func (conf *hotRegionSchedulerConfig) GetMinHotByteRate() float64 { return conf.MinHotByteRate } +func (conf *hotRegionSchedulerConfig) GetEnableForTiFlash() bool { + conf.RLock() + defer conf.RUnlock() + return conf.EnableForTiFlash +} + func (conf *hotRegionSchedulerConfig) ServeHTTP(w http.ResponseWriter, r *http.Request) { router := mux.NewRouter() router.HandleFunc("/list", conf.handleGetConfig).Methods("GET") @@ -214,5 +231,4 @@ func (conf *hotRegionSchedulerConfig) persist() error { } return conf.storage.SaveScheduleConfig(HotRegionName, data) - } diff --git a/server/schedulers/hot_region_test.go b/server/schedulers/hot_region_test.go index 24448bc310f1..334c3210183a 100644 --- a/server/schedulers/hot_region_test.go +++ b/server/schedulers/hot_region_test.go @@ -82,7 +82,7 @@ func (s *testHotSchedulerSuite) TestGCPendingOpInfos(c *C) { c.Assert(err, IsNil) hb := sche.(*hotScheduler) - notDoneOp := func(region *core.RegionInfo, ty opType) *operator.Operator { + notDoneOp := func(region *core.RegionInfo, ty opType) *pendingInfluence { var op *operator.Operator var err error switch ty { @@ -93,40 +93,40 @@ func (s *testHotSchedulerSuite) TestGCPendingOpInfos(c *C) { } c.Assert(err, IsNil) c.Assert(op, NotNil) - return op + return newPendingInfluence(op, 2, 4, Influence{}) } - doneOp := func(region *core.RegionInfo, ty opType) *operator.Operator { - op := notDoneOp(region, ty) - op.Cancel() - return op + doneOp := func(region *core.RegionInfo, ty opType) *pendingInfluence { + infl := notDoneOp(region, ty) + infl.op.Cancel() + return infl } - shouldRemoveOp := func(region *core.RegionInfo, ty opType) *operator.Operator { - op := doneOp(region, ty) - operator.SetOperatorStatusReachTime(op, operator.CREATED, time.Now().Add(-3*statistics.StoreHeartBeatReportInterval*time.Second)) - return op + shouldRemoveOp := func(region *core.RegionInfo, ty opType) *pendingInfluence { + infl := doneOp(region, ty) + operator.SetOperatorStatusReachTime(infl.op, operator.CANCELED, time.Now().Add(-3*statistics.StoreHeartBeatReportInterval*time.Second)) + return infl } - opCreaters := [3]func(region *core.RegionInfo, ty opType) *operator.Operator{shouldRemoveOp, notDoneOp, doneOp} + opCreaters := [3]func(region *core.RegionInfo, ty opType) *pendingInfluence{shouldRemoveOp, notDoneOp, doneOp} typs := []opType{movePeer, transferLeader} - for i := 0; i < len(opCreaters); i++ { + for i, opCreator := range opCreaters { for j, typ := range typs { - regionID := uint64(i*len(opCreaters) + j + 1) + regionID := uint64(i*len(typs) + j + 1) region := newTestRegion(regionID) - hb.regionPendings[regionID] = opCreaters[i](region, typ) + hb.regionPendings[regionID] = opCreator(region, typ) } } - hb.gcRegionPendings() + hb.summaryPendingInfluence() // Calling this function will GC. 
- for i := 0; i < len(opCreaters); i++ { + for i := range opCreaters { for j, typ := range typs { - regionID := uint64(i*len(opCreaters) + j + 1) + regionID := uint64(i*len(typs) + j + 1) if i < 1 { // shouldRemoveOp c.Assert(hb.regionPendings, Not(HasKey), regionID) } else { // notDoneOp, doneOp c.Assert(hb.regionPendings, HasKey, regionID) - kind := hb.regionPendings[regionID].Kind() + kind := hb.regionPendings[regionID].op.Kind() switch typ { case transferLeader: c.Assert(kind&operator.OpLeader != 0, IsTrue) @@ -1373,15 +1373,15 @@ func (s *testInfluenceSerialSuite) TestInfluenceByRWType(c *C) { op := hb.Schedule(tc)[0] c.Assert(op, NotNil) hb.(*hotScheduler).summaryPendingInfluence() - pendingInfluence := hb.(*hotScheduler).pendingSums - c.Assert(nearlyAbout(pendingInfluence[1].Loads[statistics.RegionWriteKeys], -0.5*MB), IsTrue) - c.Assert(nearlyAbout(pendingInfluence[1].Loads[statistics.RegionWriteBytes], -0.5*MB), IsTrue) - c.Assert(nearlyAbout(pendingInfluence[4].Loads[statistics.RegionWriteKeys], 0.5*MB), IsTrue) - c.Assert(nearlyAbout(pendingInfluence[4].Loads[statistics.RegionWriteBytes], 0.5*MB), IsTrue) - c.Assert(nearlyAbout(pendingInfluence[1].Loads[statistics.RegionReadKeys], -0.5*MB), IsTrue) - c.Assert(nearlyAbout(pendingInfluence[1].Loads[statistics.RegionReadBytes], -0.5*MB), IsTrue) - c.Assert(nearlyAbout(pendingInfluence[4].Loads[statistics.RegionReadKeys], 0.5*MB), IsTrue) - c.Assert(nearlyAbout(pendingInfluence[4].Loads[statistics.RegionReadBytes], 0.5*MB), IsTrue) + stInfos := hb.(*hotScheduler).stInfos + c.Assert(nearlyAbout(stInfos[1].PendingSum.Loads[statistics.RegionWriteKeys], -0.5*MB), IsTrue) + c.Assert(nearlyAbout(stInfos[1].PendingSum.Loads[statistics.RegionWriteBytes], -0.5*MB), IsTrue) + c.Assert(nearlyAbout(stInfos[4].PendingSum.Loads[statistics.RegionWriteKeys], 0.5*MB), IsTrue) + c.Assert(nearlyAbout(stInfos[4].PendingSum.Loads[statistics.RegionWriteBytes], 0.5*MB), IsTrue) + c.Assert(nearlyAbout(stInfos[1].PendingSum.Loads[statistics.RegionReadKeys], -0.5*MB), IsTrue) + c.Assert(nearlyAbout(stInfos[1].PendingSum.Loads[statistics.RegionReadBytes], -0.5*MB), IsTrue) + c.Assert(nearlyAbout(stInfos[4].PendingSum.Loads[statistics.RegionReadKeys], 0.5*MB), IsTrue) + c.Assert(nearlyAbout(stInfos[4].PendingSum.Loads[statistics.RegionReadBytes], 0.5*MB), IsTrue) addRegionInfo(tc, write, []testRegionInfo{ {2, []uint64{1, 2, 3}, 0.7 * MB, 0.7 * MB}, @@ -1399,16 +1399,16 @@ func (s *testInfluenceSerialSuite) TestInfluenceByRWType(c *C) { op = hb.Schedule(tc)[0] c.Assert(op, NotNil) hb.(*hotScheduler).summaryPendingInfluence() - pendingInfluence = hb.(*hotScheduler).pendingSums + stInfos = hb.(*hotScheduler).stInfos // assert read/write influence is the sum of write peer and write leader - c.Assert(nearlyAbout(pendingInfluence[1].Loads[statistics.RegionWriteKeys], -1.2*MB), IsTrue) - c.Assert(nearlyAbout(pendingInfluence[1].Loads[statistics.RegionWriteBytes], -1.2*MB), IsTrue) - c.Assert(nearlyAbout(pendingInfluence[3].Loads[statistics.RegionWriteKeys], 0.7*MB), IsTrue) - c.Assert(nearlyAbout(pendingInfluence[3].Loads[statistics.RegionWriteBytes], 0.7*MB), IsTrue) - c.Assert(nearlyAbout(pendingInfluence[1].Loads[statistics.RegionReadKeys], -1.2*MB), IsTrue) - c.Assert(nearlyAbout(pendingInfluence[1].Loads[statistics.RegionReadBytes], -1.2*MB), IsTrue) - c.Assert(nearlyAbout(pendingInfluence[3].Loads[statistics.RegionReadKeys], 0.7*MB), IsTrue) - c.Assert(nearlyAbout(pendingInfluence[3].Loads[statistics.RegionReadBytes], 0.7*MB), IsTrue) + 
c.Assert(nearlyAbout(stInfos[1].PendingSum.Loads[statistics.RegionWriteKeys], -1.2*MB), IsTrue) + c.Assert(nearlyAbout(stInfos[1].PendingSum.Loads[statistics.RegionWriteBytes], -1.2*MB), IsTrue) + c.Assert(nearlyAbout(stInfos[3].PendingSum.Loads[statistics.RegionWriteKeys], 0.7*MB), IsTrue) + c.Assert(nearlyAbout(stInfos[3].PendingSum.Loads[statistics.RegionWriteBytes], 0.7*MB), IsTrue) + c.Assert(nearlyAbout(stInfos[1].PendingSum.Loads[statistics.RegionReadKeys], -1.2*MB), IsTrue) + c.Assert(nearlyAbout(stInfos[1].PendingSum.Loads[statistics.RegionReadBytes], -1.2*MB), IsTrue) + c.Assert(nearlyAbout(stInfos[3].PendingSum.Loads[statistics.RegionReadKeys], 0.7*MB), IsTrue) + c.Assert(nearlyAbout(stInfos[3].PendingSum.Loads[statistics.RegionReadBytes], 0.7*MB), IsTrue) } func nearlyAbout(f1, f2 float64) bool { diff --git a/server/schedulers/shuffle_hot_region.go b/server/schedulers/shuffle_hot_region.go index 8d7b749ba029..7f997a6b2964 100644 --- a/server/schedulers/shuffle_hot_region.go +++ b/server/schedulers/shuffle_hot_region.go @@ -132,22 +132,20 @@ func (s *shuffleHotRegionScheduler) Schedule(cluster opt.Cluster) []*operator.Op } func (s *shuffleHotRegionScheduler) dispatch(typ rwType, cluster opt.Cluster) []*operator.Operator { - stores := cluster.GetStores() + storeInfos := summaryStoreInfos(cluster) storesLoads := cluster.GetStoresLoads() switch typ { case read: s.stLoadInfos[readLeader] = summaryStoresLoad( - stores, + storeInfos, storesLoads, - map[uint64]*Influence{}, cluster.RegionReadStats(), read, core.LeaderKind) return s.randomSchedule(cluster, s.stLoadInfos[readLeader]) case write: s.stLoadInfos[writeLeader] = summaryStoresLoad( - stores, + storeInfos, storesLoads, - map[uint64]*Influence{}, cluster.RegionWriteStats(), write, core.LeaderKind) return s.randomSchedule(cluster, s.stLoadInfos[writeLeader]) diff --git a/server/schedulers/utils.go b/server/schedulers/utils.go index bb9d3a7654dd..e2a76ecd0fe4 100644 --- a/server/schedulers/utils.go +++ b/server/schedulers/utils.go @@ -14,6 +14,7 @@ package schedulers import ( + "fmt" "math" "net/url" "strconv" @@ -207,16 +208,6 @@ type Influence struct { Count float64 } -func (lhs *Influence) add(rhs *Influence, w float64) *Influence { - var infl Influence - for i := range lhs.Loads { - infl.Loads = append(infl.Loads, lhs.Loads[i]+rhs.Loads[i]*w) - } - infl.Count = infl.Count + rhs.Count*w - return &infl -} - -// TODO: merge it into OperatorInfluence. 
 type pendingInfluence struct {
 	op       *operator.Operator
 	from, to uint64
 	origin   Influence
@@ -232,31 +223,7 @@ func newPendingInfluence(op *operator.Operator, from, to uint64, infl Influence)
 	}
 }
 
-// summaryPendingInfluence calculate the summary pending Influence for each store and return storeID -> Influence
-// It makes each key/byte rate or count become (1+w) times to the origin value while f is the function to provide w(weight)
-func summaryPendingInfluence(pendings map[*pendingInfluence]struct{}, f func(*operator.Operator) float64) map[uint64]*Influence {
-	ret := make(map[uint64]*Influence)
-	for p := range pendings {
-		w := f(p.op)
-		if w == 0 {
-			delete(pendings, p)
-		}
-		if _, ok := ret[p.to]; !ok {
-			ret[p.to] = &Influence{Loads: make([]float64, len(p.origin.Loads))}
-		}
-		ret[p.to] = ret[p.to].add(&p.origin, w)
-		if _, ok := ret[p.from]; !ok {
-			ret[p.from] = &Influence{Loads: make([]float64, len(p.origin.Loads))}
-		}
-		ret[p.from] = ret[p.from].add(&p.origin, -w)
-	}
-	return ret
-}
-
-type storeLoad struct {
-	Loads []float64
-	Count float64
-}
+type storeLoad Influence
 
 func (load storeLoad) ToLoadPred(rwTy rwType, infl *Influence) *storeLoadPred {
 	future := storeLoad{
@@ -407,8 +374,44 @@ func maxLoad(a, b *storeLoad) *storeLoad {
 	}
 }
 
+type storeInfo struct {
+	Store      *core.StoreInfo
+	IsTiFlash  bool
+	PendingSum *Influence
+}
+
+func summaryStoreInfos(cluster opt.Cluster) map[uint64]*storeInfo {
+	stores := cluster.GetStores()
+	infos := make(map[uint64]*storeInfo, len(stores))
+	for _, store := range stores {
+		info := &storeInfo{
+			Store:      store,
+			IsTiFlash:  core.IsTiFlashStore(store.GetMeta()),
+			PendingSum: nil,
+		}
+		infos[store.GetID()] = info
+	}
+	return infos
+}
+
+func (s *storeInfo) addInfluence(infl *Influence, w float64) {
+	if infl == nil || w == 0 {
+		return
+	}
+	if s.PendingSum == nil {
+		s.PendingSum = &Influence{
+			Loads: make([]float64, len(infl.Loads)),
+			Count: 0,
+		}
+	}
+	for i, load := range infl.Loads {
+		s.PendingSum.Loads[i] += load * w
+	}
+	s.PendingSum.Count += infl.Count * w
+}
+
 type storeLoadDetail struct {
-	Store    *core.StoreInfo
+	Info     *storeInfo
 	LoadPred *storeLoadPred
 	HotPeers []*statistics.HotPeerStat
 }
@@ -466,3 +469,134 @@ func toHotPeerStatShow(p *statistics.HotPeerStat, kind rwType) statistics.HotPee
 		LastUpdateTime: p.LastUpdateTime,
 	}
 }
+
+// summaryStoresLoad summarizes the load information of all available stores.
+// It filters the hot peers and calculates the current and future stats (byte/key rate, count) for each store.
+func summaryStoresLoad(
+	storeInfos map[uint64]*storeInfo,
+	storesLoads map[uint64][]float64,
+	storeHotPeers map[uint64][]*statistics.HotPeerStat,
+	rwTy rwType,
+	kind core.ResourceKind,
+) map[uint64]*storeLoadDetail {
+	// loadDetail stores the storeID -> hotPeers stat and its current and future stat(key/byte rate,count)
+	loadDetail := make(map[uint64]*storeLoadDetail, len(storesLoads))
+	allLoadSum := make([]float64, statistics.DimLen)
+	allCount := 0.0
+
+	// Stores without byte rate statistics are not available to schedule.
+	for _, info := range storeInfos {
+		store := info.Store
+		id := store.GetID()
+		storeLoads, ok := storesLoads[id]
+		if !ok {
+			continue
+		}
+		isTiFlash := core.IsTiFlashStore(store.GetMeta())
+		loads := make([]float64, statistics.DimLen)
+		switch rwTy {
+		case read:
+			loads[statistics.ByteDim] = storeLoads[statistics.StoreReadBytes]
+			loads[statistics.KeyDim] = storeLoads[statistics.StoreReadKeys]
+		case write:
+			if isTiFlash {
+				loads[statistics.ByteDim] = storeLoads[statistics.StoreRegionsWriteBytes]
+				loads[statistics.KeyDim] = storeLoads[statistics.StoreRegionsWriteKeys]
+			} else {
+				loads[statistics.ByteDim] = storeLoads[statistics.StoreWriteBytes]
+				loads[statistics.KeyDim] = storeLoads[statistics.StoreWriteKeys]
+			}
+		}
+
+		// Find all hot peers first
+		var hotPeers []*statistics.HotPeerStat
+		{
+			peerLoadSum := make([]float64, statistics.DimLen)
+			// TODO: To remove `filterHotPeers`, we need to:
+			// HotLeaders consider `Write{Bytes,Keys}`, so when we schedule `writeLeader`, all peers are leader.
+			for _, peer := range filterHotPeers(kind, storeHotPeers[id]) {
+				for i := range peerLoadSum {
+					peerLoadSum[i] += peer.GetLoad(getRegionStatKind(rwTy, i))
+				}
+				hotPeers = append(hotPeers, peer.Clone())
+			}
+			// Use sum of hot peers to estimate leader-only byte rate.
+			// For write requests, Write{Bytes, Keys} is applied to all Peers at the same time, while the Leader and Follower are under different loads (usually the Leader consumes more CPU).
+			// But none of the current dimensions reflects this difference, so we create a new dimension to reflect it.
+			if kind == core.LeaderKind && rwTy == write {
+				loads = peerLoadSum
+			}
+
+			// Metric for debug.
+			{
+				ty := "byte-rate-" + rwTy.String() + "-" + kind.String()
+				hotPeerSummary.WithLabelValues(ty, fmt.Sprintf("%v", id)).Set(peerLoadSum[statistics.ByteDim])
+			}
+			{
+				ty := "key-rate-" + rwTy.String() + "-" + kind.String()
+				hotPeerSummary.WithLabelValues(ty, fmt.Sprintf("%v", id)).Set(peerLoadSum[statistics.KeyDim])
+			}
+		}
+
+		if !isTiFlash {
+			// The TiFlash flow is isolated from TiKV, so it is not counted in the sum.
+			for i := range allLoadSum {
+				allLoadSum[i] += loads[i]
+			}
+		}
+		allCount += float64(len(hotPeers))
+
+		// Build store load prediction from current load and pending influence.
+		stLoadPred := (&storeLoad{
+			Loads: loads,
+			Count: float64(len(hotPeers)),
+		}).ToLoadPred(rwTy, info.PendingSum)
+
+		// Construct store load info.
+		loadDetail[id] = &storeLoadDetail{
+			Info:     info,
+			LoadPred: stLoadPred,
+			HotPeers: hotPeers,
+		}
+	}
+	storeLen := float64(len(storesLoads))
+	// Store the expected byte/key rate and count for each store-load detail.
+	for id, detail := range loadDetail {
+		expectLoads := make([]float64, len(allLoadSum))
+		for i := range expectLoads {
+			expectLoads[i] = allLoadSum[i] / storeLen
+		}
+		expectCount := allCount / storeLen
+		detail.LoadPred.Expect.Loads = expectLoads
+		detail.LoadPred.Expect.Count = expectCount
+		// Debug
+		{
+			ty := "exp-byte-rate-" + rwTy.String() + "-" + kind.String()
+			hotPeerSummary.WithLabelValues(ty, fmt.Sprintf("%v", id)).Set(expectLoads[statistics.ByteDim])
+		}
+		{
+			ty := "exp-key-rate-" + rwTy.String() + "-" + kind.String()
+			hotPeerSummary.WithLabelValues(ty, fmt.Sprintf("%v", id)).Set(expectLoads[statistics.KeyDim])
+		}
+		{
+			ty := "exp-count-rate-" + rwTy.String() + "-" + kind.String()
+			hotPeerSummary.WithLabelValues(ty, fmt.Sprintf("%v", id)).Set(expectCount)
+		}
+	}
+	return loadDetail
+}
+
+// filterHotPeers filters the peers whose hot degree is less than minHotDegree.
+func filterHotPeers(
+	kind core.ResourceKind,
+	peers []*statistics.HotPeerStat,
+) []*statistics.HotPeerStat {
+	ret := make([]*statistics.HotPeerStat, 0, len(peers))
+	for _, peer := range peers {
+		if kind == core.LeaderKind && !peer.IsLeader() {
+			continue
+		}
+		ret = append(ret, peer)
+	}
+	return ret
+}
diff --git a/tests/pdctl/scheduler/scheduler_test.go b/tests/pdctl/scheduler/scheduler_test.go
index 805cdfa6ce65..629b5c3aa61b 100644
--- a/tests/pdctl/scheduler/scheduler_test.go
+++ b/tests/pdctl/scheduler/scheduler_test.go
@@ -268,6 +268,7 @@ func (s *schedulerTestSuite) TestScheduler(c *C) {
 		"minor-dec-ratio":     0.99,
 		"src-tolerance-ratio": 1.05,
 		"dst-tolerance-ratio": 1.05,
+		"enable-for-tiflash":  true,
 	}
 	c.Assert(conf, DeepEquals, expected1)
 	mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "src-tolerance-ratio", "1.02"}, nil)
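
The core idea of this change is how pending influence is aged out: a finished operator keeps occupying its source and destination stores until the relevant statistics window catches up, and TiFlash stores age out more slowly because their write flow arrives through region heartbeats rather than store heartbeats. The standalone Go sketch below is not part of the patch; it illustrates the idea under simplified assumptions. pendingWeight and applyPending are hypothetical names standing in for calcPendingInfluence and storeInfo.addInfluence, the weight inside the zombie window is flattened to 1 exactly as the TODO in the patch notes, and the 10s store / 60s region heartbeat intervals are assumed PD defaults.

package main

import (
	"fmt"
	"time"
)

// pendingWeight is a simplified stand-in for calcPendingInfluence: a running
// operator's influence counts fully (weight 1); once it has finished, it keeps
// counting until it has been finished for maxZombieDur, after which the
// pending record can be garbage-collected.
func pendingWeight(finished bool, finishedAt time.Time, maxZombieDur time.Duration) (weight float64, needGC bool) {
	if !finished {
		return 1, false
	}
	if time.Since(finishedAt) >= maxZombieDur {
		return 0, true // statistics have caught up; drop the record
	}
	return 1, false // still inside the zombie window; keep booking the influence
}

// applyPending books one move's influence into per-store sums the way
// storeInfo.addInfluence does: the destination gains the flow and the source
// sheds it, scaled by the weight.
func applyPending(sums map[uint64][]float64, from, to uint64, loads []float64, w float64) {
	if w == 0 {
		return
	}
	for _, id := range []uint64{from, to} {
		if _, ok := sums[id]; !ok {
			sums[id] = make([]float64, len(loads))
		}
	}
	for i, l := range loads {
		sums[from][i] -= l * w
		sums[to][i] += l * w
	}
}

func main() {
	const (
		storeHeartbeat  = 10 * time.Second // assumed TiKV store-stat report interval
		regionHeartbeat = 60 * time.Second // assumed region-stat report interval
		maxZombieRounds = 3
	)
	// TiKV stores wait maxZombieRounds store heartbeats; TiFlash stores wait
	// maxZombieRounds*2 region heartbeats, mirroring GetStoreStatZombieDuration
	// and GetRegionsStatZombieDuration above.
	tikvZombie := maxZombieRounds * storeHeartbeat         // 30s
	tiflashZombie := maxZombieRounds * 2 * regionHeartbeat // 360s

	finishedAt := time.Now().Add(-40 * time.Second)
	w, gc := pendingWeight(true, finishedAt, tikvZombie)
	fmt.Printf("tikv:    weight=%.0f gc=%v\n", w, gc) // weight=0 gc=true
	w, gc = pendingWeight(true, finishedAt, tiflashZombie)
	fmt.Printf("tiflash: weight=%.0f gc=%v\n", w, gc) // weight=1 gc=false

	// A region moving 0.5 MB/s of writes from store 1 to store 4 shows up as
	// -0.5 MB/s on the source and +0.5 MB/s on the destination while pending.
	sums := map[uint64][]float64{}
	applyPending(sums, 1, 4, []float64{0.5 * 1024 * 1024}, w)
	fmt.Println("store1:", sums[1], "store4:", sums[4])
}

Run against a move that finished 40 seconds ago, the TiKV window (30s) has already expired while the TiFlash window (360s) still books the full influence, which is why the scheduler needs two separate zombie durations.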