From e9c31e808e7b0986f6eaf81f2e57c0b3bfe90677 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Fri, 21 Jun 2019 20:03:55 +0800 Subject: [PATCH 01/25] fix evict schedulers have not been deleted --- pkg/manager/member/tikv_upgrader.go | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pkg/manager/member/tikv_upgrader.go b/pkg/manager/member/tikv_upgrader.go index a4b8164521b..e609caec554 100644 --- a/pkg/manager/member/tikv_upgrader.go +++ b/pkg/manager/member/tikv_upgrader.go @@ -182,6 +182,12 @@ func (tku *tikvUpgrader) endEvictLeader(tc *v1alpha1.TidbCluster, ordinal int32) if err != nil { return err } + + err = tku.pdControl.GetPDClient(tc).EndEvictLeader(storeID) + if err != nil { + return err + } + _, evicting := upgradedPod.Annotations[EvictLeaderBeginTime] if evicting { delete(upgradedPod.Annotations, EvictLeaderBeginTime) @@ -190,10 +196,6 @@ func (tku *tikvUpgrader) endEvictLeader(tc *v1alpha1.TidbCluster, ordinal int32) return err } } - err = tku.pdControl.GetPDClient(tc).EndEvictLeader(storeID) - if err != nil { - return err - } return nil } From facc381d970db07c52e78fd17bdf138892165938 Mon Sep 17 00:00:00 2001 From: weekface Date: Thu, 27 Jun 2019 13:41:49 +0800 Subject: [PATCH 02/25] aaaa --- pkg/manager/member/tikv_upgrader.go | 16 +++--- tests/actions.go | 67 ++++++++++++++++++++++ tests/cmd/stability/main.go | 48 +++++++++------- tests/pkg/webhook/pods.go | 86 ++++++++++++++--------------- 4 files changed, 146 insertions(+), 71 deletions(-) diff --git a/pkg/manager/member/tikv_upgrader.go b/pkg/manager/member/tikv_upgrader.go index e609caec554..282d6277db3 100644 --- a/pkg/manager/member/tikv_upgrader.go +++ b/pkg/manager/member/tikv_upgrader.go @@ -120,7 +120,6 @@ func (tku *tikvUpgrader) upgradeTiKVPod(tc *v1alpha1.TidbCluster, ordinal int32, if err != nil { return err } - _, evicting := upgradePod.Annotations[EvictLeaderBeginTime] if tku.readyToUpgrade(upgradePod, store) { err := tku.endEvictLeader(tc, ordinal) @@ -131,7 +130,9 @@ func (tku *tikvUpgrader) upgradeTiKVPod(tc *v1alpha1.TidbCluster, ordinal int32, return nil } + _, evicting := upgradePod.Annotations[EvictLeaderBeginTime] if !evicting { + glog.Infof("@@@@@@@@@@@@@@@@@ %s", storeID) return tku.beginEvictLeader(tc, storeID, upgradePod) } return controller.RequeueErrorf("tidbcluster: [%s/%s]'s tikv pod: [%s] is evicting leader", ns, tcName, upgradePodName) @@ -159,19 +160,19 @@ func (tku *tikvUpgrader) readyToUpgrade(upgradePod *corev1.Pod, store v1alpha1.T } func (tku *tikvUpgrader) beginEvictLeader(tc *v1alpha1.TidbCluster, storeID uint64, pod *corev1.Pod) error { - err := tku.pdControl.GetPDClient(tc).BeginEvictLeader(storeID) - if err != nil { - return err - } if pod.Annotations == nil { pod.Annotations = map[string]string{} } pod.Annotations[EvictLeaderBeginTime] = time.Now().Format(time.RFC3339) - _, err = tku.podControl.UpdatePod(tc, pod) - return err + _, err := tku.podControl.UpdatePod(tc, pod) + if err != nil { + return err + } + return tku.pdControl.GetPDClient(tc).BeginEvictLeader(storeID) } func (tku *tikvUpgrader) endEvictLeader(tc *v1alpha1.TidbCluster, ordinal int32) error { + time.Sleep(30 * time.Second) store := tku.getStoreByOrdinal(tc, ordinal) storeID, err := strconv.ParseUint(store.ID, 10, 64) if err != nil { @@ -187,6 +188,7 @@ func (tku *tikvUpgrader) endEvictLeader(tc *v1alpha1.TidbCluster, ordinal int32) if err != nil { return err } + glog.Infof("ppppppppppppppppppp%s", storeID) _, evicting := upgradedPod.Annotations[EvictLeaderBeginTime] if evicting { diff --git a/tests/actions.go b/tests/actions.go index 303f0d138bd..40c271ec7f5 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -14,6 +14,7 @@ package tests import ( + "context" "crypto/tls" "database/sql" "encoding/json" @@ -127,6 +128,8 @@ type OperatorActions interface { ScaleTidbClusterOrDie(info *TidbClusterConfig) CheckScaleInSafely(info *TidbClusterConfig) error CheckScaledCorrectly(info *TidbClusterConfig, podUIDsBeforeScale map[string]types.UID) error + CheckUpgradeOrDie(ctx context.Context, info *TidbClusterConfig) + CheckUpgrade(ctx context.Context, info *TidbClusterConfig) error UpgradeTidbCluster(info *TidbClusterConfig) error UpgradeTidbClusterOrDie(info *TidbClusterConfig) DeployAdHocBackup(info *TidbClusterConfig) error @@ -904,6 +907,70 @@ func (oa *operatorActions) UpgradeTidbClusterOrDie(info *TidbClusterConfig) { } } +func (oa *operatorActions) CheckUpgrade(ctx context.Context, info *TidbClusterConfig) error { + ns := info.Namespace + tcName := info.ClusterName + + findStoreFn := func(tc *v1alpha1.TidbCluster, podName string) string { + for storeID, store := range tc.Status.TiKV.Stores { + if store.PodName == podName { + return storeID + } + } + + return "" + } + + for { + tc, err := oa.cli.PingcapV1alpha1().TidbClusters(ns).Get(tcName, metav1.GetOptions{}) + if err != nil { + glog.Errorf("failed to get tidbcluster: %s/%s, %v", ns, tcName, err) + continue + } + pdClient := controller.NewDefaultPDControl().GetPDClient(tc) + + replicas := tc.Spec.TiKV.Replicas + for i := replicas - 1; i > 0; i-- { + if err := wait.PollImmediate(10*time.Millisecond, 5*time.Minute, func() (done bool, err error) { + schedulers, err := pdClient.GetEvictLeaderSchedulers() + if err != nil { + glog.Errorf("failed to get evict leader schedulers, %v", err) + return false, nil + } + if len(schedulers) > 1 { + return true, fmt.Errorf("there are too many evict leader schedulers: %v", schedulers) + } + if len(schedulers) == 0 { + return false, nil + } + podName := fmt.Sprintf("%s-tikv-%d", tcName, i) + glog.Info(schedulers) + glog.Info(podName) + glog.Info(tc.Status.TiKV.Stores) + scheduler := fmt.Sprintf("evict-leader-scheduler-%s", findStoreFn(tc, podName)) + glog.Info(scheduler) + glog.Info(replicas) + if schedulers[0] == scheduler { + return true, nil + } + return true, fmt.Errorf("the scheduler: %s != %s", schedulers[0], scheduler) + }); err != nil { + glog.Errorf("failed to check upgrade %s/%s, %v", ns, tcName, err) + return err + } + } + break + } + + return nil +} + +func (oa *operatorActions) CheckUpgradeOrDie(ctx context.Context, info *TidbClusterConfig) { + if err := oa.CheckUpgrade(ctx, info); err != nil { + slack.NotifyAndPanic(err) + } +} + func (oa *operatorActions) DeployMonitor(info *TidbClusterConfig) error { return nil } func (oa *operatorActions) CleanMonitor(info *TidbClusterConfig) error { return nil } diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index 7dcfab11adb..8312f47a0a8 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -14,6 +14,7 @@ package main import ( + "context" "fmt" "net/http" _ "net/http/pprof" @@ -32,7 +33,7 @@ import ( ) var cfg *tests.Config -var context *apimachinery.CertContext +var certCtx *apimachinery.CertContext var upgradeVersions []string func main() { @@ -46,11 +47,11 @@ func main() { ns := os.Getenv("NAMESPACE") var err error - context, err = apimachinery.SetupServerCert(ns, tests.WebhookServiceName) + certCtx, err = apimachinery.SetupServerCert(ns, tests.WebhookServiceName) if err != nil { panic(err) } - go tests.StartValidatingAdmissionWebhookServerOrDie(context) + go tests.StartValidatingAdmissionWebhookServerOrDie(certCtx) c := cron.New() if err := c.AddFunc("0 0 10 * * *", func() { @@ -137,38 +138,43 @@ func run() { go oa.BeginInsertDataToOrDie(cluster) } - // scale out - for _, cluster := range clusters { - cluster.ScaleTiDB(3).ScaleTiKV(5).ScalePD(5) - oa.ScaleTidbClusterOrDie(cluster) - } - for _, cluster := range clusters { - oa.CheckTidbClusterStatusOrDie(cluster) - oa.CheckDisasterToleranceOrDie(cluster) - } + if false { + // scale out + for _, cluster := range clusters { + cluster.ScaleTiDB(3).ScaleTiKV(5).ScalePD(5) + oa.ScaleTidbClusterOrDie(cluster) + } + for _, cluster := range clusters { + oa.CheckTidbClusterStatusOrDie(cluster) + oa.CheckDisasterToleranceOrDie(cluster) + } - // scale in - for _, cluster := range clusters { - cluster.ScaleTiDB(2).ScaleTiKV(3).ScalePD(3) - oa.ScaleTidbClusterOrDie(cluster) - } - for _, cluster := range clusters { - oa.CheckTidbClusterStatusOrDie(cluster) - oa.CheckDisasterToleranceOrDie(cluster) + // scale in + for _, cluster := range clusters { + cluster.ScaleTiDB(2).ScaleTiKV(3).ScalePD(3) + oa.ScaleTidbClusterOrDie(cluster) + } + for _, cluster := range clusters { + oa.CheckTidbClusterStatusOrDie(cluster) + oa.CheckDisasterToleranceOrDie(cluster) + } } // upgrade - oa.RegisterWebHookAndServiceOrDie(context, ocfg) + oa.RegisterWebHookAndServiceOrDie(certCtx, ocfg) + ctx, cancel := context.WithCancel(context.Background()) for idx, cluster := range clusters { assignedNodes := oa.GetTidbMemberAssignedNodesOrDie(cluster) cluster.UpgradeAll(upgradeVersion) oa.UpgradeTidbClusterOrDie(cluster) + oa.CheckUpgradeOrDie(ctx, cluster) if idx == 0 { oa.CheckManualPauseTiDBOrDie(cluster) } oa.CheckTidbClusterStatusOrDie(cluster) oa.CheckTidbMemberAssignedNodesOrDie(cluster, assignedNodes) } + cancel() // configuration change for _, cluster := range clusters { diff --git a/tests/pkg/webhook/pods.go b/tests/pkg/webhook/pods.go index 9021947fd9b..c2adc832aef 100644 --- a/tests/pkg/webhook/pods.go +++ b/tests/pkg/webhook/pods.go @@ -151,49 +151,49 @@ func admitPods(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse { } glog.Infof("savely delete pod namespace %s name %s leader name %s", namespace, name, leader.Name) - } else if pod.Labels[label.ComponentLabelKey] == "tikv" { - - var storeID uint64 - storeID = 0 - for _, store := range tc.Status.TiKV.Stores { - if store.PodName == name { - storeID, err = strconv.ParseUint(store.ID, 10, 64) - if err != nil { - glog.Errorf("fail to convert string to int while deleting PD err %v", err) - return &reviewResponse - } - break - } - } - - // Fail to get store in stores - if storeID == 0 { - glog.Errorf("fail to find store in TIKV.Stores podname %s", name) - return &reviewResponse - } - - storeInfo, err := pdClient.GetStore(storeID) - if err != nil { - glog.Errorf("fail to read storeID %d response %v", storeID, err) - return &reviewResponse - } - - beforeCount := kvLeaderMap[namespace][name] - afterCount := storeInfo.Status.LeaderCount - - if beforeCount != 0 && !(afterCount < beforeCount) && tc.Status.TiKV.StatefulSet.Replicas > 1 { - time.Sleep(10 * time.Second) - err := fmt.Errorf("failed to evict leader from %s/%s, before: %d, now: %d", - namespace, name, beforeCount, afterCount) - glog.Error(err) - sendErr := slack.SendErrMsg(err.Error()) - if sendErr != nil { - glog.Error(sendErr) - } - // TODO use context instead - os.Exit(3) - } - glog.Infof("savely delete pod namespace %s name %s before count %d after count %d", namespace, name, beforeCount, afterCount) + //} else if pod.Labels[label.ComponentLabelKey] == "tikv" { + + // var storeID uint64 + // storeID = 0 + // for _, store := range tc.Status.TiKV.Stores { + // if store.PodName == name { + // storeID, err = strconv.ParseUint(store.ID, 10, 64) + // if err != nil { + // glog.Errorf("fail to convert string to int while deleting PD err %v", err) + // return &reviewResponse + // } + // break + // } + // } + + // // Fail to get store in stores + // if storeID == 0 { + // glog.Errorf("fail to find store in TIKV.Stores podname %s", name) + // return &reviewResponse + // } + + // storeInfo, err := pdClient.GetStore(storeID) + // if err != nil { + // glog.Errorf("fail to read storeID %d response %v", storeID, err) + // return &reviewResponse + // } + + // beforeCount := kvLeaderMap[namespace][name] + // afterCount := storeInfo.Status.LeaderCount + + // if beforeCount != 0 && !(afterCount < beforeCount) && tc.Status.TiKV.StatefulSet.Replicas > 1 { + // time.Sleep(10 * time.Second) + // err := fmt.Errorf("failed to evict leader from %s/%s, before: %d, now: %d", + // namespace, name, beforeCount, afterCount) + // glog.Error(err) + // sendErr := slack.SendErrMsg(err.Error()) + // if sendErr != nil { + // glog.Error(sendErr) + // } + // // TODO use context instead + // os.Exit(3) + // } + // glog.Infof("savely delete pod namespace %s name %s before count %d after count %d", namespace, name, beforeCount, afterCount) } reviewResponse.Allowed = true return &reviewResponse From 9c379e03cab1fd8a1523ba34d312363a00abf9ff Mon Sep 17 00:00:00 2001 From: weekface Date: Tue, 2 Jul 2019 10:29:55 +0800 Subject: [PATCH 03/25] stash --- pkg/manager/member/tikv_upgrader.go | 10 +++++++--- tests/actions.go | 20 ++++++++++++++++++-- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/pkg/manager/member/tikv_upgrader.go b/pkg/manager/member/tikv_upgrader.go index 282d6277db3..dd757e089d3 100644 --- a/pkg/manager/member/tikv_upgrader.go +++ b/pkg/manager/member/tikv_upgrader.go @@ -184,9 +184,13 @@ func (tku *tikvUpgrader) endEvictLeader(tc *v1alpha1.TidbCluster, ordinal int32) return err } - err = tku.pdControl.GetPDClient(tc).EndEvictLeader(storeID) - if err != nil { - return err + for i := 3; i > 0; i-- { + err = tku.pdControl.GetPDClient(tc).EndEvictLeader(storeID) + if err != nil { + return err + } + glog.Infof("storeID: %d", storeID) + time.Sleep(5 * time.Second) } glog.Infof("ppppppppppppppppppp%s", storeID) diff --git a/tests/actions.go b/tests/actions.go index 40c271ec7f5..c50348e0297 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -931,7 +931,7 @@ func (oa *operatorActions) CheckUpgrade(ctx context.Context, info *TidbClusterCo replicas := tc.Spec.TiKV.Replicas for i := replicas - 1; i > 0; i-- { - if err := wait.PollImmediate(10*time.Millisecond, 5*time.Minute, func() (done bool, err error) { + if err := wait.PollImmediate(1*time.Second, 6*time.Minute, func() (done bool, err error) { schedulers, err := pdClient.GetEvictLeaderSchedulers() if err != nil { glog.Errorf("failed to get evict leader schedulers, %v", err) @@ -953,12 +953,28 @@ func (oa *operatorActions) CheckUpgrade(ctx context.Context, info *TidbClusterCo if schedulers[0] == scheduler { return true, nil } - return true, fmt.Errorf("the scheduler: %s != %s", schedulers[0], scheduler) + glog.Errorf("the scheduler: %s != %s", schedulers[0], scheduler) + return false, nil }); err != nil { glog.Errorf("failed to check upgrade %s/%s, %v", ns, tcName, err) return err } } + if err := wait.PollImmediate(10*time.Millisecond, 6*time.Minute, func() (done bool, err error) { + schedulers, err := pdClient.GetEvictLeaderSchedulers() + if err != nil { + glog.Errorf("failed to get evict leader schedulers, %v", err) + return false, nil + } + if len(schedulers) == 0 { + return true, nil + } + glog.Errorf("schedulers: %v is not empty") + return false, nil + }); err != nil { + glog.Errorf("failed to wait all schedulers deleted %s/%s, %v", ns, tcName, err) + return err + } break } From f5951ebe107ee5c51497d857b3bee80fbbe5a868 Mon Sep 17 00:00:00 2001 From: weekface Date: Tue, 2 Jul 2019 11:37:56 +0800 Subject: [PATCH 04/25] bb --- tests/actions.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/actions.go b/tests/actions.go index c50348e0297..7692d8cdf04 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -931,7 +931,7 @@ func (oa *operatorActions) CheckUpgrade(ctx context.Context, info *TidbClusterCo replicas := tc.Spec.TiKV.Replicas for i := replicas - 1; i > 0; i-- { - if err := wait.PollImmediate(1*time.Second, 6*time.Minute, func() (done bool, err error) { + if err := wait.PollImmediate(5*time.Second, 10*time.Minute, func() (done bool, err error) { schedulers, err := pdClient.GetEvictLeaderSchedulers() if err != nil { glog.Errorf("failed to get evict leader schedulers, %v", err) From 41903c84fcd77e8556ad4ace67c3511e42a8b100 Mon Sep 17 00:00:00 2001 From: weekface Date: Tue, 2 Jul 2019 15:31:57 +0800 Subject: [PATCH 05/25] sss --- pkg/manager/member/tikv_upgrader.go | 47 +++++++++++++++++------------ tests/actions.go | 2 +- tests/cmd/stability/main.go | 22 ++++++++++++++ 3 files changed, 51 insertions(+), 20 deletions(-) diff --git a/pkg/manager/member/tikv_upgrader.go b/pkg/manager/member/tikv_upgrader.go index dd757e089d3..4f332fdd369 100644 --- a/pkg/manager/member/tikv_upgrader.go +++ b/pkg/manager/member/tikv_upgrader.go @@ -130,6 +130,18 @@ func (tku *tikvUpgrader) upgradeTiKVPod(tc *v1alpha1.TidbCluster, ordinal int32, return nil } + if ordinal < tc.Spec.TiKV.Replicas-1 { + nextPodName := tikvPodName(tcName, ordinal+1) + nextPod, err := tku.podLister.Pods(ns).Get(nextPodName) + if err != nil { + return err + } + _, nextPodEvicting := nextPod.Annotations[EvictLeaderBeginTime] + if nextPodEvicting { + return controller.RequeueErrorf("waiting for tidbcluster[%s/%s]'s tikv pod: [%s] is evicting leader", ns, tcName, nextPodName) + } + } + _, evicting := upgradePod.Annotations[EvictLeaderBeginTime] if !evicting { glog.Infof("@@@@@@@@@@@@@@@@@ %s", storeID) @@ -178,30 +190,27 @@ func (tku *tikvUpgrader) endEvictLeader(tc *v1alpha1.TidbCluster, ordinal int32) if err != nil { return err } - upgradedPodName := tikvPodName(tc.GetName(), ordinal) - upgradedPod, err := tku.podLister.Pods(tc.GetNamespace()).Get(upgradedPodName) + // upgradedPodName := tikvPodName(tc.GetName(), ordinal) + // upgradedPod, err := tku.podLister.Pods(tc.GetNamespace()).Get(upgradedPodName) + // if err != nil { + // return err + // } + + err = tku.pdControl.GetPDClient(tc).EndEvictLeader(storeID) if err != nil { return err } - - for i := 3; i > 0; i-- { - err = tku.pdControl.GetPDClient(tc).EndEvictLeader(storeID) - if err != nil { - return err - } - glog.Infof("storeID: %d", storeID) - time.Sleep(5 * time.Second) - } + glog.Infof("storeID: %d", storeID) glog.Infof("ppppppppppppppppppp%s", storeID) - _, evicting := upgradedPod.Annotations[EvictLeaderBeginTime] - if evicting { - delete(upgradedPod.Annotations, EvictLeaderBeginTime) - _, err = tku.podControl.UpdatePod(tc, upgradedPod) - if err != nil { - return err - } - } + // _, evicting := upgradedPod.Annotations[EvictLeaderBeginTime] + // if evicting { + // delete(upgradedPod.Annotations, EvictLeaderBeginTime) + // _, err = tku.podControl.UpdatePod(tc, upgradedPod) + // if err != nil { + // return err + // } + // } return nil } diff --git a/tests/actions.go b/tests/actions.go index 7692d8cdf04..bc02b74cd71 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -960,7 +960,7 @@ func (oa *operatorActions) CheckUpgrade(ctx context.Context, info *TidbClusterCo return err } } - if err := wait.PollImmediate(10*time.Millisecond, 6*time.Minute, func() (done bool, err error) { + if err := wait.PollImmediate(5*time.Second, 6*time.Minute, func() (done bool, err error) { schedulers, err := pdClient.GetEvictLeaderSchedulers() if err != nil { glog.Errorf("failed to get evict leader schedulers, %v", err) diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index 8312f47a0a8..fb8ed4a99a1 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -73,6 +73,13 @@ func run() { cluster1 := newTidbClusterConfig("ns1", "cluster1") cluster2 := newTidbClusterConfig("ns2", "cluster2") cluster3 := newTidbClusterConfig("ns2", "cluster3") + cluster4 := newTidbClusterConfig("ns2", "cluster4") + cluster5 := newTidbClusterConfig("ns2", "cluster5") + cluster6 := newTidbClusterConfig("ns2", "cluster6") + cluster7 := newTidbClusterConfig("ns2", "cluster7") + cluster8 := newTidbClusterConfig("ns2", "cluster8") + cluster9 := newTidbClusterConfig("ns2", "cluster9") + cluster10 := newTidbClusterConfig("ns2", "cluster10") restoreCluster1 := newTidbClusterConfig("ns1", "restore1") restoreCluster2 := newTidbClusterConfig("ns2", "restore2") @@ -86,6 +93,13 @@ func run() { cluster1, cluster2, cluster3, + cluster4, + cluster5, + cluster6, + cluster7, + cluster8, + cluster9, + cluster10, restoreCluster1, restoreCluster2, onePDCluster1, @@ -277,6 +291,14 @@ func run() { preUpgrade := []*tests.TidbClusterConfig{ cluster1, cluster2, + cluster3, + cluster4, + cluster5, + cluster6, + cluster7, + cluster8, + cluster9, + cluster10, } caseFn(preUpgrade, onePDCluster1, restoreCluster1, upgradeVersions[0]) From e31fbc482da8ac9e4ad90684f3aa836bbc817ae0 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Wed, 3 Jul 2019 11:37:57 +0800 Subject: [PATCH 06/25] fix --- pkg/manager/member/tikv_upgrader.go | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/pkg/manager/member/tikv_upgrader.go b/pkg/manager/member/tikv_upgrader.go index 4f332fdd369..4afeda66734 100644 --- a/pkg/manager/member/tikv_upgrader.go +++ b/pkg/manager/member/tikv_upgrader.go @@ -120,6 +120,11 @@ func (tku *tikvUpgrader) upgradeTiKVPod(tc *v1alpha1.TidbCluster, ordinal int32, if err != nil { return err } + _, evicting := upgradePod.Annotations[EvictLeaderBeginTime] + if !evicting { + glog.Infof("@@@@@@@@@@@@@@@@@ %s", storeID) + return tku.beginEvictLeader(tc, storeID, upgradePod) + } if tku.readyToUpgrade(upgradePod, store) { err := tku.endEvictLeader(tc, ordinal) @@ -130,23 +135,6 @@ func (tku *tikvUpgrader) upgradeTiKVPod(tc *v1alpha1.TidbCluster, ordinal int32, return nil } - if ordinal < tc.Spec.TiKV.Replicas-1 { - nextPodName := tikvPodName(tcName, ordinal+1) - nextPod, err := tku.podLister.Pods(ns).Get(nextPodName) - if err != nil { - return err - } - _, nextPodEvicting := nextPod.Annotations[EvictLeaderBeginTime] - if nextPodEvicting { - return controller.RequeueErrorf("waiting for tidbcluster[%s/%s]'s tikv pod: [%s] is evicting leader", ns, tcName, nextPodName) - } - } - - _, evicting := upgradePod.Annotations[EvictLeaderBeginTime] - if !evicting { - glog.Infof("@@@@@@@@@@@@@@@@@ %s", storeID) - return tku.beginEvictLeader(tc, storeID, upgradePod) - } return controller.RequeueErrorf("tidbcluster: [%s/%s]'s tikv pod: [%s] is evicting leader", ns, tcName, upgradePodName) } } From 8fc19a17f7926fbc5bfc5cd406814c4eeb4830b1 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Wed, 3 Jul 2019 15:16:01 +0800 Subject: [PATCH 07/25] change main --- tests/cmd/stability/main.go | 9 --------- 1 file changed, 9 deletions(-) diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index fb8ed4a99a1..5fcbf62a3dc 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -77,9 +77,6 @@ func run() { cluster5 := newTidbClusterConfig("ns2", "cluster5") cluster6 := newTidbClusterConfig("ns2", "cluster6") cluster7 := newTidbClusterConfig("ns2", "cluster7") - cluster8 := newTidbClusterConfig("ns2", "cluster8") - cluster9 := newTidbClusterConfig("ns2", "cluster9") - cluster10 := newTidbClusterConfig("ns2", "cluster10") restoreCluster1 := newTidbClusterConfig("ns1", "restore1") restoreCluster2 := newTidbClusterConfig("ns2", "restore2") @@ -97,9 +94,6 @@ func run() { cluster5, cluster6, cluster7, - cluster8, - cluster9, - cluster10, restoreCluster1, restoreCluster2, onePDCluster1, @@ -296,9 +290,6 @@ func run() { cluster5, cluster6, cluster7, - cluster8, - cluster9, - cluster10, } caseFn(preUpgrade, onePDCluster1, restoreCluster1, upgradeVersions[0]) From 9b9e0015c171102df51a6e1d2c86784c2cb87d35 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Wed, 3 Jul 2019 17:18:50 +0800 Subject: [PATCH 08/25] remove useless log --- pkg/manager/member/tikv_upgrader.go | 28 +++++------------------- pkg/manager/member/tikv_upgrader_test.go | 24 +++++++++++--------- tests/actions.go | 5 ----- 3 files changed, 20 insertions(+), 37 deletions(-) diff --git a/pkg/manager/member/tikv_upgrader.go b/pkg/manager/member/tikv_upgrader.go index 4afeda66734..927668cf84d 100644 --- a/pkg/manager/member/tikv_upgrader.go +++ b/pkg/manager/member/tikv_upgrader.go @@ -122,7 +122,6 @@ func (tku *tikvUpgrader) upgradeTiKVPod(tc *v1alpha1.TidbCluster, ordinal int32, } _, evicting := upgradePod.Annotations[EvictLeaderBeginTime] if !evicting { - glog.Infof("@@@@@@@@@@@@@@@@@ %s", storeID) return tku.beginEvictLeader(tc, storeID, upgradePod) } @@ -160,15 +159,16 @@ func (tku *tikvUpgrader) readyToUpgrade(upgradePod *corev1.Pod, store v1alpha1.T } func (tku *tikvUpgrader) beginEvictLeader(tc *v1alpha1.TidbCluster, storeID uint64, pod *corev1.Pod) error { + err := tku.pdControl.GetPDClient(tc).BeginEvictLeader(storeID) + if err != nil { + return err + } if pod.Annotations == nil { pod.Annotations = map[string]string{} } pod.Annotations[EvictLeaderBeginTime] = time.Now().Format(time.RFC3339) - _, err := tku.podControl.UpdatePod(tc, pod) - if err != nil { - return err - } - return tku.pdControl.GetPDClient(tc).BeginEvictLeader(storeID) + _, err = tku.podControl.UpdatePod(tc, pod) + return err } func (tku *tikvUpgrader) endEvictLeader(tc *v1alpha1.TidbCluster, ordinal int32) error { @@ -178,27 +178,11 @@ func (tku *tikvUpgrader) endEvictLeader(tc *v1alpha1.TidbCluster, ordinal int32) if err != nil { return err } - // upgradedPodName := tikvPodName(tc.GetName(), ordinal) - // upgradedPod, err := tku.podLister.Pods(tc.GetNamespace()).Get(upgradedPodName) - // if err != nil { - // return err - // } err = tku.pdControl.GetPDClient(tc).EndEvictLeader(storeID) if err != nil { return err } - glog.Infof("storeID: %d", storeID) - glog.Infof("ppppppppppppppppppp%s", storeID) - - // _, evicting := upgradedPod.Annotations[EvictLeaderBeginTime] - // if evicting { - // delete(upgradedPod.Annotations, EvictLeaderBeginTime) - // _, err = tku.podControl.UpdatePod(tc, upgradedPod) - // if err != nil { - // return err - // } - // } return nil } diff --git a/pkg/manager/member/tikv_upgrader_test.go b/pkg/manager/member/tikv_upgrader_test.go index c47af04a894..4e29ef62d5d 100644 --- a/pkg/manager/member/tikv_upgrader_test.go +++ b/pkg/manager/member/tikv_upgrader_test.go @@ -128,7 +128,13 @@ func TestTiKVUpgraderUpgrade(t *testing.T) { changeOldSet: func(oldSet *apps.StatefulSet) { SetLastAppliedConfigAnnotation(oldSet) }, - changePods: nil, + changePods: func(pods []*corev1.Pod) { + for _, pod := range pods { + if pod.GetName() == tikvPodName(upgradeTcName, 2) { + pod.Annotations = map[string]string{EvictLeaderBeginTime: time.Now().Add(-1 * time.Minute).Format(time.RFC3339)} + } + } + }, beginEvictLeaderErr: false, endEvictLeaderErr: false, updatePodErr: false, @@ -138,10 +144,6 @@ func TestTiKVUpgraderUpgrade(t *testing.T) { expectFn: func(g *GomegaWithT, tc *v1alpha1.TidbCluster, newSet *apps.StatefulSet, pods map[string]*corev1.Pod) { g.Expect(tc.Status.TiKV.Phase).To(Equal(v1alpha1.UpgradePhase)) g.Expect(*newSet.Spec.UpdateStrategy.RollingUpdate.Partition).To(Equal(int32(2))) - if pods[tikvPodName(upgradeTcName, 2)].Annotations != nil { - _, exist := pods[tikvPodName(upgradeTcName, 2)].Annotations[EvictLeaderBeginTime] - g.Expect(exist).To(BeFalse()) - } }, }, { @@ -163,7 +165,13 @@ func TestTiKVUpgraderUpgrade(t *testing.T) { oldSet.Status.UpdatedReplicas = 1 oldSet.Spec.UpdateStrategy.RollingUpdate.Partition = func() *int32 { i := int32(2); return &i }() }, - changePods: nil, + changePods: func(pods []*corev1.Pod) { + for _, pod := range pods { + if pod.GetName() == tikvPodName(upgradeTcName, 1) { + pod.Annotations = map[string]string{EvictLeaderBeginTime: time.Now().Add(-1 * time.Minute).Format(time.RFC3339)} + } + } + }, beginEvictLeaderErr: false, endEvictLeaderErr: false, updatePodErr: false, @@ -172,10 +180,6 @@ func TestTiKVUpgraderUpgrade(t *testing.T) { }, expectFn: func(g *GomegaWithT, tc *v1alpha1.TidbCluster, newSet *apps.StatefulSet, pods map[string]*corev1.Pod) { g.Expect(*newSet.Spec.UpdateStrategy.RollingUpdate.Partition).To(Equal(int32(1))) - if pods[tikvPodName(upgradeTcName, 1)].Annotations != nil { - _, exist := pods[tikvPodName(upgradeTcName, 1)].Annotations[EvictLeaderBeginTime] - g.Expect(exist).To(BeFalse()) - } }, }, { diff --git a/tests/actions.go b/tests/actions.go index bc02b74cd71..56d5f1f18e3 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -944,12 +944,7 @@ func (oa *operatorActions) CheckUpgrade(ctx context.Context, info *TidbClusterCo return false, nil } podName := fmt.Sprintf("%s-tikv-%d", tcName, i) - glog.Info(schedulers) - glog.Info(podName) - glog.Info(tc.Status.TiKV.Stores) scheduler := fmt.Sprintf("evict-leader-scheduler-%s", findStoreFn(tc, podName)) - glog.Info(scheduler) - glog.Info(replicas) if schedulers[0] == scheduler { return true, nil } From 44b5f047f31c77090b97635bcdbb9bbacefc5230 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Wed, 3 Jul 2019 19:12:57 +0800 Subject: [PATCH 09/25] remove useless code --- tests/actions.go | 5 --- tests/cmd/stability/main.go | 48 ++++++++-------------- tests/pkg/webhook/pods.go | 80 ------------------------------------- 3 files changed, 17 insertions(+), 116 deletions(-) diff --git a/tests/actions.go b/tests/actions.go index 56d5f1f18e3..29e79e1a736 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -885,11 +885,6 @@ func (oa *operatorActions) SetPartitionAnnotation(tcName string, nameSpace strin } func (oa *operatorActions) UpgradeTidbCluster(info *TidbClusterConfig) error { - // record tikv leader count in webhook first - err := webhook.GetAllKVLeaders(oa.cli, info.Namespace, info.ClusterName) - if err != nil { - return err - } oa.EmitEvent(info, "UpgradeTidbCluster") cmd := oa.getHelmUpgradeClusterCmd(info, nil) diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index 5fcbf62a3dc..2bfd9c82fbd 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -73,10 +73,6 @@ func run() { cluster1 := newTidbClusterConfig("ns1", "cluster1") cluster2 := newTidbClusterConfig("ns2", "cluster2") cluster3 := newTidbClusterConfig("ns2", "cluster3") - cluster4 := newTidbClusterConfig("ns2", "cluster4") - cluster5 := newTidbClusterConfig("ns2", "cluster5") - cluster6 := newTidbClusterConfig("ns2", "cluster6") - cluster7 := newTidbClusterConfig("ns2", "cluster7") restoreCluster1 := newTidbClusterConfig("ns1", "restore1") restoreCluster2 := newTidbClusterConfig("ns2", "restore2") @@ -90,10 +86,6 @@ func run() { cluster1, cluster2, cluster3, - cluster4, - cluster5, - cluster6, - cluster7, restoreCluster1, restoreCluster2, onePDCluster1, @@ -146,26 +138,24 @@ func run() { go oa.BeginInsertDataToOrDie(cluster) } - if false { - // scale out - for _, cluster := range clusters { - cluster.ScaleTiDB(3).ScaleTiKV(5).ScalePD(5) - oa.ScaleTidbClusterOrDie(cluster) - } - for _, cluster := range clusters { - oa.CheckTidbClusterStatusOrDie(cluster) - oa.CheckDisasterToleranceOrDie(cluster) - } + // scale out + for _, cluster := range clusters { + cluster.ScaleTiDB(3).ScaleTiKV(5).ScalePD(5) + oa.ScaleTidbClusterOrDie(cluster) + } + for _, cluster := range clusters { + oa.CheckTidbClusterStatusOrDie(cluster) + oa.CheckDisasterToleranceOrDie(cluster) + } - // scale in - for _, cluster := range clusters { - cluster.ScaleTiDB(2).ScaleTiKV(3).ScalePD(3) - oa.ScaleTidbClusterOrDie(cluster) - } - for _, cluster := range clusters { - oa.CheckTidbClusterStatusOrDie(cluster) - oa.CheckDisasterToleranceOrDie(cluster) - } + // scale in + for _, cluster := range clusters { + cluster.ScaleTiDB(2).ScaleTiKV(3).ScalePD(3) + oa.ScaleTidbClusterOrDie(cluster) + } + for _, cluster := range clusters { + oa.CheckTidbClusterStatusOrDie(cluster) + oa.CheckDisasterToleranceOrDie(cluster) } // upgrade @@ -286,10 +276,6 @@ func run() { cluster1, cluster2, cluster3, - cluster4, - cluster5, - cluster6, - cluster7, } caseFn(preUpgrade, onePDCluster1, restoreCluster1, upgradeVersions[0]) diff --git a/tests/pkg/webhook/pods.go b/tests/pkg/webhook/pods.go index c2adc832aef..8e1a1b272b6 100644 --- a/tests/pkg/webhook/pods.go +++ b/tests/pkg/webhook/pods.go @@ -12,7 +12,6 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "github.com/golang/glog" - "github.com/pingcap/tidb-operator/pkg/client/clientset/versioned" "github.com/pingcap/tidb-operator/pkg/controller" "github.com/pingcap/tidb-operator/pkg/label" "github.com/pingcap/tidb-operator/tests/pkg/client" @@ -24,42 +23,6 @@ var ( kvLeaderMap map[string]map[string]int ) -func GetAllKVLeaders(versionCli versioned.Interface, namespace string, clusterName string) error { - - if kvLeaderMap == nil { - kvLeaderMap = make(map[string]map[string]int) - } - - if kvLeaderMap[namespace] == nil { - kvLeaderMap[namespace] = make(map[string]int) - } - - tc, err := versionCli.PingcapV1alpha1().TidbClusters(namespace).Get(clusterName, metav1.GetOptions{}) - - if err != nil { - glog.Infof("fail to get tc clustername %s namesapce %s %v", clusterName, namespace, err) - return err - } - - pdClient := controller.NewDefaultPDControl().GetPDClient(tc) - - for _, store := range tc.Status.TiKV.Stores { - storeID, err := strconv.ParseUint(store.ID, 10, 64) - if err != nil { - glog.Errorf("fail to convert string to int while deleting TIKV err %v", err) - return err - } - storeInfo, err := pdClient.GetStore(storeID) - if err != nil { - glog.Errorf("fail to read response %v", err) - return err - } - kvLeaderMap[namespace][store.PodName] = storeInfo.Status.LeaderCount - } - - return nil -} - // only allow pods to be delete when it is not ddlowner of tidb, not leader of pd and not // master of tikv. func admitPods(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse { @@ -151,49 +114,6 @@ func admitPods(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse { } glog.Infof("savely delete pod namespace %s name %s leader name %s", namespace, name, leader.Name) - //} else if pod.Labels[label.ComponentLabelKey] == "tikv" { - - // var storeID uint64 - // storeID = 0 - // for _, store := range tc.Status.TiKV.Stores { - // if store.PodName == name { - // storeID, err = strconv.ParseUint(store.ID, 10, 64) - // if err != nil { - // glog.Errorf("fail to convert string to int while deleting PD err %v", err) - // return &reviewResponse - // } - // break - // } - // } - - // // Fail to get store in stores - // if storeID == 0 { - // glog.Errorf("fail to find store in TIKV.Stores podname %s", name) - // return &reviewResponse - // } - - // storeInfo, err := pdClient.GetStore(storeID) - // if err != nil { - // glog.Errorf("fail to read storeID %d response %v", storeID, err) - // return &reviewResponse - // } - - // beforeCount := kvLeaderMap[namespace][name] - // afterCount := storeInfo.Status.LeaderCount - - // if beforeCount != 0 && !(afterCount < beforeCount) && tc.Status.TiKV.StatefulSet.Replicas > 1 { - // time.Sleep(10 * time.Second) - // err := fmt.Errorf("failed to evict leader from %s/%s, before: %d, now: %d", - // namespace, name, beforeCount, afterCount) - // glog.Error(err) - // sendErr := slack.SendErrMsg(err.Error()) - // if sendErr != nil { - // glog.Error(sendErr) - // } - // // TODO use context instead - // os.Exit(3) - // } - // glog.Infof("savely delete pod namespace %s name %s before count %d after count %d", namespace, name, beforeCount, afterCount) } reviewResponse.Allowed = true return &reviewResponse From d1a3d19a803f88da0b235c83285bfe133069f18a Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Wed, 3 Jul 2019 19:16:36 +0800 Subject: [PATCH 10/25] remove useless code --- tests/cmd/stability/main.go | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index 2bfd9c82fbd..cc22181a326 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -275,7 +275,6 @@ func run() { preUpgrade := []*tests.TidbClusterConfig{ cluster1, cluster2, - cluster3, } caseFn(preUpgrade, onePDCluster1, restoreCluster1, upgradeVersions[0]) From ad3522ace39f74cd6e858e11d72452cf235ddb27 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Wed, 3 Jul 2019 19:19:20 +0800 Subject: [PATCH 11/25] remove useless code --- pkg/manager/member/tikv_upgrader.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/manager/member/tikv_upgrader.go b/pkg/manager/member/tikv_upgrader.go index 927668cf84d..2d4c0ac682a 100644 --- a/pkg/manager/member/tikv_upgrader.go +++ b/pkg/manager/member/tikv_upgrader.go @@ -172,7 +172,6 @@ func (tku *tikvUpgrader) beginEvictLeader(tc *v1alpha1.TidbCluster, storeID uint } func (tku *tikvUpgrader) endEvictLeader(tc *v1alpha1.TidbCluster, ordinal int32) error { - time.Sleep(30 * time.Second) store := tku.getStoreByOrdinal(tc, ordinal) storeID, err := strconv.ParseUint(store.ID, 10, 64) if err != nil { From d74e56d0a376323b5471b59369fb2033262d390d Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Wed, 3 Jul 2019 19:26:36 +0800 Subject: [PATCH 12/25] fix lint error --- tests/actions.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/actions.go b/tests/actions.go index 29e79e1a736..316dea289b8 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -959,7 +959,7 @@ func (oa *operatorActions) CheckUpgrade(ctx context.Context, info *TidbClusterCo if len(schedulers) == 0 { return true, nil } - glog.Errorf("schedulers: %v is not empty") + glog.Errorf("schedulers: %v is not empty", schedulers) return false, nil }); err != nil { glog.Errorf("failed to wait all schedulers deleted %s/%s, %v", ns, tcName, err) From f0a493b3c966aea233f5d00766ca6176adaff48a Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Wed, 3 Jul 2019 19:39:33 +0800 Subject: [PATCH 13/25] address comment --- tests/pkg/webhook/pods.go | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/pkg/webhook/pods.go b/tests/pkg/webhook/pods.go index be44808f0ed..3c0e657f2ea 100644 --- a/tests/pkg/webhook/pods.go +++ b/tests/pkg/webhook/pods.go @@ -18,11 +18,6 @@ import ( "k8s.io/api/admission/v1beta1" ) -var ( - // Pod name may the same in different namespaces - kvLeaderMap map[string]map[string]int -) - // only allow pods to be delete when it is not ddlowner of tidb, not leader of pd and not // master of tikv. func admitPods(ar v1beta1.AdmissionReview) *v1beta1.AdmissionResponse { From 8c6f54986ce93135e4ca0b17f91d259135ea43af Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Thu, 4 Jul 2019 13:42:57 +0800 Subject: [PATCH 14/25] add a lack argument --- tests/cmd/stability/stability.go | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/cmd/stability/stability.go b/tests/cmd/stability/stability.go index 37ae08a79e5..63e999d6ff3 100644 --- a/tests/cmd/stability/stability.go +++ b/tests/cmd/stability/stability.go @@ -70,6 +70,7 @@ func newTidbClusterConfig(ns, clusterName string) *tests.TidbClusterConfig { TiKVGrpcConcurrency: 4, TiDBTokenLimit: 1000, PDLogLevel: "info", + TopologyKey: topologyKey, SubValues: tests.GetAffinityConfigOrDie(clusterName, ns, topologyKey, []string{topologyKey}), } } From e86fb939c2cefc78166382ba4e05be76329f9fad Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Thu, 4 Jul 2019 17:13:33 +0800 Subject: [PATCH 15/25] add test mode --- .../templates/controller-manager-deployment.yaml | 3 +++ cmd/controller-manager/main.go | 1 + pkg/controller/controller_utils.go | 2 ++ pkg/manager/member/tikv_upgrader.go | 4 ++++ tests/actions.go | 6 ++++-- tests/cmd/e2e/main.go | 1 + 6 files changed, 15 insertions(+), 2 deletions(-) diff --git a/charts/tidb-operator/templates/controller-manager-deployment.yaml b/charts/tidb-operator/templates/controller-manager-deployment.yaml index e61bae0a0e4..aa5e7a72d19 100644 --- a/charts/tidb-operator/templates/controller-manager-deployment.yaml +++ b/charts/tidb-operator/templates/controller-manager-deployment.yaml @@ -40,6 +40,9 @@ spec: - -tikv-failover-period={{ .Values.controllerManager.tikvFailoverPeriod | default "5m" }} - -tidb-failover-period={{ .Values.controllerManager.tidbFailoverPeriod | default "5m" }} - -v={{ .Values.controllerManager.logLevel }} + {{- if .Values.testMode }} + - -test-mode={{ .Values.testMode }} + {{- end}} env: - name: NAMESPACE valueFrom: diff --git a/cmd/controller-manager/main.go b/cmd/controller-manager/main.go index 2309c256228..df81c20e32f 100644 --- a/cmd/controller-manager/main.go +++ b/cmd/controller-manager/main.go @@ -62,6 +62,7 @@ func init() { flag.DurationVar(&pdFailoverPeriod, "pd-failover-period", time.Duration(5*time.Minute), "PD failover period default(5m)") flag.DurationVar(&tikvFailoverPeriod, "tikv-failover-period", time.Duration(5*time.Minute), "TiKV failover period default(5m)") flag.DurationVar(&tidbFailoverPeriod, "tidb-failover-period", time.Duration(5*time.Minute), "TiDB failover period") + flag.BoolVar(&controller.TestMode, "test-mode", false, "whether tidb-operator run in test mode") flag.Parse() } diff --git a/pkg/controller/controller_utils.go b/pkg/controller/controller_utils.go index ba2b69d5261..03dd812406c 100644 --- a/pkg/controller/controller_utils.go +++ b/pkg/controller/controller_utils.go @@ -31,6 +31,8 @@ var ( DefaultStorageClassName string // ClusterScoped controls whether operator should manage kubernetes cluster wide TiDB clusters ClusterScoped bool + // TestMode defines whether tidb operator run in test mode, test mode is only open when test + TestMode bool ) const ( diff --git a/pkg/manager/member/tikv_upgrader.go b/pkg/manager/member/tikv_upgrader.go index a86894add6d..5809db4ae90 100644 --- a/pkg/manager/member/tikv_upgrader.go +++ b/pkg/manager/member/tikv_upgrader.go @@ -173,6 +173,10 @@ func (tku *tikvUpgrader) beginEvictLeader(tc *v1alpha1.TidbCluster, storeID uint } func (tku *tikvUpgrader) endEvictLeader(tc *v1alpha1.TidbCluster, ordinal int32) error { + // wait 5 second before delete evict scheduler,it is for auto test can catch these info + if controller.TestMode { + time.Sleep(5 * time.Second) + } store := tku.getStoreByOrdinal(tc, ordinal) storeID, err := strconv.ParseUint(store.ID, 10, 64) if err != nil { diff --git a/tests/actions.go b/tests/actions.go index 9ec504e30aa..c94c2de5336 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -225,6 +225,7 @@ type OperatorConfig struct { WebhookConfigName string Context *apimachinery.CertContext ImagePullPolicy corev1.PullPolicy + TestMode bool } type TidbClusterConfig struct { @@ -348,6 +349,7 @@ func (oi *OperatorConfig) OperatorHelmSetString(m map[string]string) string { "controllerManager.replicas": "2", "scheduler.replicas": "2", "imagePullPolicy": string(oi.ImagePullPolicy), + "testMode": strconv.FormatBool(oi.TestMode), } if oi.SchedulerTag != "" { set["scheduler.kubeSchedulerImageTag"] = oi.SchedulerTag @@ -935,7 +937,7 @@ func (oa *operatorActions) CheckUpgrade(ctx context.Context, info *TidbClusterCo replicas := tc.Spec.TiKV.Replicas for i := replicas - 1; i > 0; i-- { - if err := wait.PollImmediate(5*time.Second, 10*time.Minute, func() (done bool, err error) { + if err := wait.PollImmediate(1*time.Second, 10*time.Minute, func() (done bool, err error) { schedulers, err := pdClient.GetEvictLeaderSchedulers() if err != nil { glog.Errorf("failed to get evict leader schedulers, %v", err) @@ -959,7 +961,7 @@ func (oa *operatorActions) CheckUpgrade(ctx context.Context, info *TidbClusterCo return err } } - if err := wait.PollImmediate(5*time.Second, 6*time.Minute, func() (done bool, err error) { + if err := wait.PollImmediate(1*time.Second, 6*time.Minute, func() (done bool, err error) { schedulers, err := pdClient.GetEvictLeaderSchedulers() if err != nil { glog.Errorf("failed to get evict leader schedulers, %v", err) diff --git a/tests/cmd/e2e/main.go b/tests/cmd/e2e/main.go index 07e8f97fa9c..97b3ac3eaad 100644 --- a/tests/cmd/e2e/main.go +++ b/tests/cmd/e2e/main.go @@ -56,6 +56,7 @@ func main() { WebhookSecretName: "webhook-secret", WebhookConfigName: "webhook-config", ImagePullPolicy: v1.PullIfNotPresent, + TestMode: true, } ns := os.Getenv("NAMESPACE") From 41c7a79689156b3145dfb8a6c61efe0980f69c8f Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Thu, 4 Jul 2019 17:40:29 +0800 Subject: [PATCH 16/25] fix bug --- tests/cmd/stability/stability.go | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/cmd/stability/stability.go b/tests/cmd/stability/stability.go index 63e999d6ff3..21be46c38bf 100644 --- a/tests/cmd/stability/stability.go +++ b/tests/cmd/stability/stability.go @@ -22,6 +22,7 @@ func newOperatorConfig() *tests.OperatorConfig { WebhookSecretName: "webhook-secret", WebhookConfigName: "webhook-config", ImagePullPolicy: v1.PullAlways, + TestMode: true, } } From 589f0961840b48dfa7b1c85a193c56cfecffcfe7 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Thu, 4 Jul 2019 19:10:15 +0800 Subject: [PATCH 17/25] add log --- tests/actions.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/actions.go b/tests/actions.go index c94c2de5336..e6e9ed9aed3 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -943,18 +943,21 @@ func (oa *operatorActions) CheckUpgrade(ctx context.Context, info *TidbClusterCo glog.Errorf("failed to get evict leader schedulers, %v", err) return false, nil } + glog.Infof("index:%d,schedulers: %v, error: %v", i, schedulers, err) if len(schedulers) > 1 { return true, fmt.Errorf("there are too many evict leader schedulers: %v", schedulers) } if len(schedulers) == 0 { + glog.Infof("schedulers count is zero,%v", schedulers) return false, nil } podName := fmt.Sprintf("%s-tikv-%d", tcName, i) scheduler := fmt.Sprintf("evict-leader-scheduler-%s", findStoreFn(tc, podName)) if schedulers[0] == scheduler { + glog.Infof("index: %d,the schedulers: %s = %s", i, schedulers[0], scheduler) return true, nil } - glog.Errorf("the scheduler: %s != %s", schedulers[0], scheduler) + glog.Errorf("index: %d,the scheduler: %s != %s", i, schedulers[0], scheduler) return false, nil }); err != nil { glog.Errorf("failed to check upgrade %s/%s, %v", ns, tcName, err) From bb5aab95d979fbab04fc8e2aedd41d065baaf807 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Thu, 4 Jul 2019 19:30:13 +0800 Subject: [PATCH 18/25] fix CheckUpgrade logic --- tests/actions.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/actions.go b/tests/actions.go index e6e9ed9aed3..5230587fe94 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -936,7 +936,7 @@ func (oa *operatorActions) CheckUpgrade(ctx context.Context, info *TidbClusterCo pdClient := pdapi.NewDefaultPDControl().GetPDClient(pdapi.Namespace(tc.GetNamespace()), tc.GetName()) replicas := tc.Spec.TiKV.Replicas - for i := replicas - 1; i > 0; i-- { + for i := replicas - 1; i >= 0; i-- { if err := wait.PollImmediate(1*time.Second, 10*time.Minute, func() (done bool, err error) { schedulers, err := pdClient.GetEvictLeaderSchedulers() if err != nil { From e6e2e4cba0b48c33fe87370b6ee9a2b419caab14 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Thu, 4 Jul 2019 20:06:40 +0800 Subject: [PATCH 19/25] remove useless logs --- tests/actions.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/actions.go b/tests/actions.go index 5230587fe94..26d72c3799c 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -943,12 +943,10 @@ func (oa *operatorActions) CheckUpgrade(ctx context.Context, info *TidbClusterCo glog.Errorf("failed to get evict leader schedulers, %v", err) return false, nil } - glog.Infof("index:%d,schedulers: %v, error: %v", i, schedulers, err) if len(schedulers) > 1 { return true, fmt.Errorf("there are too many evict leader schedulers: %v", schedulers) } if len(schedulers) == 0 { - glog.Infof("schedulers count is zero,%v", schedulers) return false, nil } podName := fmt.Sprintf("%s-tikv-%d", tcName, i) From c4ace86516d6d5d55c66bf387fa58c3931a598ac Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Fri, 5 Jul 2019 11:36:20 +0800 Subject: [PATCH 20/25] add logs and add one tidb version --- tests/actions.go | 1 + tests/config.go | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/actions.go b/tests/actions.go index 26d72c3799c..04c5d7eabe8 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -943,6 +943,7 @@ func (oa *operatorActions) CheckUpgrade(ctx context.Context, info *TidbClusterCo glog.Errorf("failed to get evict leader schedulers, %v", err) return false, nil } + glog.V(4).Infof("index:%d,schedulers:%v,error:%v", i, schedulers, err) if len(schedulers) > 1 { return true, fmt.Errorf("there are too many evict leader schedulers: %v", schedulers) } diff --git a/tests/config.go b/tests/config.go index 40d0f7ce63d..30031ffc2ab 100644 --- a/tests/config.go +++ b/tests/config.go @@ -78,7 +78,7 @@ func NewConfig() (*Config, error) { flag.StringVar(&cfg.configFile, "config", "", "Config file") flag.StringVar(&cfg.LogDir, "log-dir", "/logDir", "log directory") flag.IntVar(&cfg.FaultTriggerPort, "fault-trigger-port", 23332, "the http port of fault trigger service") - flag.StringVar(&cfg.TidbVersions, "tidb-versions", "v3.0.0-rc.1,v3.0.0-rc.2", "tidb versions") + flag.StringVar(&cfg.TidbVersions, "tidb-versions", "v3.0.0-rc.1,v3.0.0-rc.2,v3.0.0", "tidb versions") flag.StringVar(&cfg.OperatorTag, "operator-tag", "master", "operator tag used to choose charts") flag.StringVar(&cfg.OperatorImage, "operator-image", "pingcap/tidb-operator:latest", "operator image") flag.StringVar(&cfg.UpgradeOperatorTag, "upgrade-operator-tag", "", "upgrade operator tag used to choose charts") From e027be7a3944dc302082ab31c741816b62a16926 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Fri, 5 Jul 2019 14:18:50 +0800 Subject: [PATCH 21/25] fix regions peers in same rack bug --- tests/dt.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/dt.go b/tests/dt.go index f4406b6d029..40eef6a2187 100644 --- a/tests/dt.go +++ b/tests/dt.go @@ -181,6 +181,10 @@ func (oa *operatorActions) CheckDataRegionDisasterTolerance(cluster *TidbCluster // regionRacks is map of rackName and the peerID regionRacks := map[string]uint64{} for _, peer := range region.Peers { + if len(region.Peers) != 3 { + glog.Infof("cluster[%s] region[%d]'s peers not equal 3,[%v]. May be the failover happened", cluster.ClusterName, region.ID, region.Peers) + continue + } storeID := strconv.FormatUint(peer.StoreId, 10) nodeName, err := oa.getNodeByStoreId(storeID, cluster) if err != nil { @@ -189,7 +193,7 @@ func (oa *operatorActions) CheckDataRegionDisasterTolerance(cluster *TidbCluster rackName := rackNodeMap[nodeName] // if the rack have more than one peer of the region, return error if otherID, exist := regionRacks[rackName]; exist { - return fmt.Errorf("region[%d]'s peer: [%d]and[%d] are in same rack:[%s]", region.ID, otherID, peer.Id, rackName) + return fmt.Errorf("cluster[%s] region[%d]'s peer: [%d]and[%d] are in same rack:[%s]", cluster.ClusterName, region.ID, otherID, peer.Id, rackName) } // add a new pair of rack and peer regionRacks[rackName] = peer.Id From 3cbd0ab71f3247d1ad54e19cb3c74842c0c6e6d2 Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Mon, 8 Jul 2019 12:04:25 +0800 Subject: [PATCH 22/25] add debug log --- pkg/manager/member/tikv_upgrader.go | 3 +++ tests/actions.go | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pkg/manager/member/tikv_upgrader.go b/pkg/manager/member/tikv_upgrader.go index 5809db4ae90..6b41f3ea06e 100644 --- a/pkg/manager/member/tikv_upgrader.go +++ b/pkg/manager/member/tikv_upgrader.go @@ -123,6 +123,7 @@ func (tku *tikvUpgrader) upgradeTiKVPod(tc *v1alpha1.TidbCluster, ordinal int32, } _, evicting := upgradePod.Annotations[EvictLeaderBeginTime] if !evicting { + glog.Infof("index:%d,upgradePodName:%s,@@@@@@@@@@@@@@@@@ %s", ordinal, upgradePodName, storeID) return tku.beginEvictLeader(tc, storeID, upgradePod) } @@ -188,6 +189,8 @@ func (tku *tikvUpgrader) endEvictLeader(tc *v1alpha1.TidbCluster, ordinal int32) return err } + glog.Infof("ordinal:%d,ppppppppppppppppppp%s", ordinal, storeID) + return nil } diff --git a/tests/actions.go b/tests/actions.go index 04c5d7eabe8..1206d58ddf1 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -677,7 +677,7 @@ func (oa *operatorActions) CheckTidbClusterStatus(info *TidbClusterConfig) error ns := info.Namespace tcName := info.ClusterName - if err := wait.Poll(oa.pollInterval, 30*time.Minute, func() (bool, error) { + if err := wait.Poll(oa.pollInterval, 120*time.Minute, func() (bool, error) { var tc *v1alpha1.TidbCluster var err error if tc, err = oa.cli.PingcapV1alpha1().TidbClusters(ns).Get(tcName, metav1.GetOptions{}); err != nil { @@ -742,7 +742,7 @@ func (oa *operatorActions) CheckTidbClusterStatus(info *TidbClusterConfig) error return true, nil }); err != nil { glog.Errorf("check tidb cluster status failed: %s", err.Error()) - return fmt.Errorf("failed to waiting for tidbcluster %s/%s ready in 30 minutes", ns, tcName) + return fmt.Errorf("failed to waiting for tidbcluster %s/%s ready in 120 minutes", ns, tcName) } return nil From f4b14e211c7b938df79c3c12154dd8799370541a Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Mon, 8 Jul 2019 16:11:30 +0800 Subject: [PATCH 23/25] fix CheckUpgrade's replicas error --- tests/actions.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/actions.go b/tests/actions.go index 1206d58ddf1..846ca8dce86 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -935,7 +935,7 @@ func (oa *operatorActions) CheckUpgrade(ctx context.Context, info *TidbClusterCo } pdClient := pdapi.NewDefaultPDControl().GetPDClient(pdapi.Namespace(tc.GetNamespace()), tc.GetName()) - replicas := tc.Spec.TiKV.Replicas + replicas := tc.TiKVRealReplicas() for i := replicas - 1; i >= 0; i-- { if err := wait.PollImmediate(1*time.Second, 10*time.Minute, func() (done bool, err error) { schedulers, err := pdClient.GetEvictLeaderSchedulers() From 35888fcd2598029e8a4b9c34b6a020d4e649a24e Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Tue, 9 Jul 2019 10:39:26 +0800 Subject: [PATCH 24/25] change log --- pkg/manager/member/tikv_upgrader.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/manager/member/tikv_upgrader.go b/pkg/manager/member/tikv_upgrader.go index 6b41f3ea06e..5d2e858cd80 100644 --- a/pkg/manager/member/tikv_upgrader.go +++ b/pkg/manager/member/tikv_upgrader.go @@ -123,7 +123,7 @@ func (tku *tikvUpgrader) upgradeTiKVPod(tc *v1alpha1.TidbCluster, ordinal int32, } _, evicting := upgradePod.Annotations[EvictLeaderBeginTime] if !evicting { - glog.Infof("index:%d,upgradePodName:%s,@@@@@@@@@@@@@@@@@ %s", ordinal, upgradePodName, storeID) + glog.Infof("start to evict leader:index:%d,upgradePodName:%s,storeID:%s", ordinal, upgradePodName, storeID) return tku.beginEvictLeader(tc, storeID, upgradePod) } @@ -189,7 +189,7 @@ func (tku *tikvUpgrader) endEvictLeader(tc *v1alpha1.TidbCluster, ordinal int32) return err } - glog.Infof("ordinal:%d,ppppppppppppppppppp%s", ordinal, storeID) + glog.Infof("successed to remove evict leader,ordinal:%d,storeID:%s", ordinal, storeID) return nil } From 2d9e88e7e7b13f304cda751e8aab53a6013d651d Mon Sep 17 00:00:00 2001 From: xiaojingchen Date: Tue, 9 Jul 2019 10:47:48 +0800 Subject: [PATCH 25/25] fix lint --- pkg/manager/member/tikv_upgrader.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/manager/member/tikv_upgrader.go b/pkg/manager/member/tikv_upgrader.go index 5d2e858cd80..e00139e0c36 100644 --- a/pkg/manager/member/tikv_upgrader.go +++ b/pkg/manager/member/tikv_upgrader.go @@ -123,7 +123,7 @@ func (tku *tikvUpgrader) upgradeTiKVPod(tc *v1alpha1.TidbCluster, ordinal int32, } _, evicting := upgradePod.Annotations[EvictLeaderBeginTime] if !evicting { - glog.Infof("start to evict leader:index:%d,upgradePodName:%s,storeID:%s", ordinal, upgradePodName, storeID) + glog.Infof("start to evict leader:index:%d,upgradePodName:%s,storeID:%d", ordinal, upgradePodName, storeID) return tku.beginEvictLeader(tc, storeID, upgradePod) } @@ -189,7 +189,7 @@ func (tku *tikvUpgrader) endEvictLeader(tc *v1alpha1.TidbCluster, ordinal int32) return err } - glog.Infof("successed to remove evict leader,ordinal:%d,storeID:%s", ordinal, storeID) + glog.Infof("successed to remove evict leader,ordinal:%d,storeID:%d", ordinal, storeID) return nil }