Skip to content

Commit

Permalink
fix: can't recover after a failed upgrade (#120)
Browse files Browse the repository at this point in the history
* tidb graceful upgrade

* fix upgrade state sync bug
  • Loading branch information
xiaojingchen authored and weekface committed Oct 19, 2018
1 parent df63a23 commit 71b0ecd
Show file tree
Hide file tree
Showing 14 changed files with 301 additions and 99 deletions.
2 changes: 1 addition & 1 deletion pkg/controller/tidbcluster/tidb_cluster_control_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ func newFakeTidbClusterControl() (ControlInterface, *controller.FakeStatefulSetC

pdMemberManager := mm.NewPDMemberManager(pdControl, setControl, svcControl, setInformer.Lister(), svcInformer.Lister(), podInformer.Lister(), podControl, pvcInformer.Lister(), pdScaler, pdUpgrader, autoFailover, pdFailover)
tikvMemberManager := mm.NewTiKVMemberManager(pdControl, setControl, svcControl, setInformer.Lister(), svcInformer.Lister(), podInformer.Lister(), nodeInformer.Lister(), autoFailover, tikvFailover, tikvScaler, tikvUpgrader)
tidbMemberManager := mm.NewTiDBMemberManager(setControl, svcControl, tidbControl, setInformer.Lister(), svcInformer.Lister(), tidbUpgrader, autoFailover, tidbFailover)
tidbMemberManager := mm.NewTiDBMemberManager(setControl, svcControl, tidbControl, setInformer.Lister(), svcInformer.Lister(), podInformer.Lister(), tidbUpgrader, autoFailover, tidbFailover)
reclaimPolicyManager := meta.NewReclaimPolicyManager(pvcInformer.Lister(), pvInformer.Lister(), pvControl)
metaManager := meta.NewMetaManager(pvcInformer.Lister(), pvcControl, pvInformer.Lister(), pvControl, podInformer.Lister(), podControl)
control := NewDefaultTidbClusterControl(tcControl, pdMemberManager, tikvMemberManager, tidbMemberManager, reclaimPolicyManager, metaManager, recorder)
Expand Down
3 changes: 2 additions & 1 deletion pkg/controller/tidbcluster/tidb_cluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ func NewController(
tidbFailover := mm.NewTiDBFailover(tidbFailoverPeriod)
pdUpgrader := mm.NewPDUpgrader(pdControl, podControl, podInformer.Lister())
tikvUpgrader := mm.NewTiKVUpgrader(pdControl, podControl, podInformer.Lister())
tidbUpgrader := mm.NewTiDBUpgrader(tidbControl)
tidbUpgrader := mm.NewTiDBUpgrader(tidbControl, podInformer.Lister())

tcc := &Controller{
kubeClient: kubeCli,
Expand Down Expand Up @@ -144,6 +144,7 @@ func NewController(
tidbControl,
setInformer.Lister(),
svcInformer.Lister(),
podInformer.Lister(),
tidbUpgrader,
autoFailover,
tidbFailover,
Expand Down
1 change: 1 addition & 0 deletions pkg/controller/tidbcluster/tidb_cluster_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,7 @@ func newFakeTidbClusterController() (*Controller, cache.Indexer, cache.Indexer)
tidbControl,
setInformer.Lister(),
svcInformer.Lister(),
podInformer.Lister(),
tidbUpgrader,
autoFailover,
tidbFailover,
Expand Down
32 changes: 30 additions & 2 deletions pkg/manager/member/pd_member_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,11 @@ func (pmm *pdMemberManager) syncTidbClusterStatus(tc *v1alpha1.TidbCluster, set

tc.Status.PD.StatefulSet = &set.Status

if statefulSetIsUpgrading(set) {
upgrading, err := pmm.pdStatefulSetIsUpgrading(set, tc)
if err != nil {
return err
}
if upgrading {
tc.Status.PD.Phase = v1alpha1.UpgradePhase
} else {
tc.Status.PD.Phase = v1alpha1.NormalPhase
Expand All @@ -287,7 +291,7 @@ func (pmm *pdMemberManager) syncTidbClusterStatus(tc *v1alpha1.TidbCluster, set
}
name := memberHealth.Name
if len(name) == 0 {
glog.Warningf("PD member: [%d] don't have a name, and can't get it from clientUrls: [%s], memberHealth Info: [%v] in [%s/%s]",
glog.Warningf("PD member: [%d] doesn't have a name, and can't get it from clientUrls: [%s], memberHealth Info: [%v] in [%s/%s]",
id, memberHealth.ClientUrls, memberHealth, ns, tcName)
continue
}
Expand Down Expand Up @@ -408,6 +412,30 @@ func (pmm *pdMemberManager) getNewPDHeadlessServiceForTidbCluster(tc *v1alpha1.T
}
}

// pdStatefulSetIsUpgrading reports whether the PD StatefulSet should still be
// treated as upgrading.
//
// It returns true when statefulSetIsUpgrading(set) is true, or when any PD pod
// selected by the cluster's PD label selector carries a controller revision
// hash label that differs from tc.Status.PD.StatefulSet.UpdateRevision.
// Encountering a pod without that label stops the scan and yields false.
// Errors from building the selector or listing pods are returned unchanged.
func (pmm *pdMemberManager) pdStatefulSetIsUpgrading(set *apps.StatefulSet, tc *v1alpha1.TidbCluster) (bool, error) {
	if statefulSetIsUpgrading(set) {
		return true, nil
	}

	pdSelector, err := label.New().Cluster(tc.GetName()).PD().Selector()
	if err != nil {
		return false, err
	}

	pods, err := pmm.podLister.Pods(tc.GetNamespace()).List(pdSelector)
	if err != nil {
		return false, err
	}

	for i := range pods {
		podRevision, labeled := pods[i].Labels[apps.ControllerRevisionHashLabelKey]
		switch {
		case !labeled:
			// A pod without the revision label ends the check immediately.
			return false, nil
		case podRevision != tc.Status.PD.StatefulSet.UpdateRevision:
			// At least one pod is not yet on the update revision.
			return true, nil
		}
	}
	return false, nil
}

func (pmm *pdMemberManager) getNewPDSetForTidbCluster(tc *v1alpha1.TidbCluster) (*apps.StatefulSet, error) {
ns := tc.Namespace
tcName := tc.Name
Expand Down
68 changes: 42 additions & 26 deletions pkg/manager/member/pd_upgrader.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,38 +66,31 @@ func (pu *pdUpgrader) gracefulUpgrade(tc *v1alpha1.TidbCluster, oldSet *apps.Sta
}

tc.Status.PD.Phase = v1alpha1.UpgradePhase
setUpgradePartition(newSet, *oldSet.Spec.UpdateStrategy.RollingUpdate.Partition)

if tc.Status.PD.StatefulSet.CurrentReplicas == 0 {
return controller.RequeueErrorf("tidbcluster: [%s/%s]'s pd doesn't have old version pod to upgrade", ns, tcName)
if !templateEqual(newSet.Spec.Template, oldSet.Spec.Template) {
return nil
}

if !tc.PDAllPodsStarted() {
return controller.RequeueErrorf("tidbcluster: [%s/%s]'s pd pods are not all created", ns, tcName)
}

for i := tc.Status.PD.StatefulSet.Replicas; i > tc.Status.PD.StatefulSet.CurrentReplicas; i-- {
if member, exist := tc.Status.PD.Members[pdPodName(tcName, i-1)]; !exist || !member.Health {
return controller.RequeueErrorf("tidbcluster: [%s/%s]'s pd upgraded pods are not all ready", ns, tcName)
setUpgradePartition(newSet, *oldSet.Spec.UpdateStrategy.RollingUpdate.Partition)
for i := tc.Status.PD.StatefulSet.Replicas - 1; i >= 0; i-- {
podName := pdPodName(tcName, i)
pod, err := pu.podLister.Pods(ns).Get(podName)
if err != nil {
return err
}
}

ordinal := tc.Status.PD.StatefulSet.CurrentReplicas - 1
upgradePodName := pdPodName(tcName, ordinal)
if tc.Status.PD.Leader.Name == upgradePodName {
var targetName string
if ordinal == *newSet.Spec.Replicas-1 {
targetName = pdPodName(tcName, 0)
} else {
targetName = pdPodName(tcName, *newSet.Spec.Replicas-1)
revision, exist := pod.Labels[apps.ControllerRevisionHashLabelKey]
if !exist {
return controller.RequeueErrorf("tidbcluster: [%s/%s]'s pd pod: [%s] has no label: %s", ns, tcName, podName, apps.ControllerRevisionHashLabelKey)
}
err := pu.transferPDLeaderTo(tc, targetName)
if err != nil {
return err

if revision == tc.Status.PD.StatefulSet.UpdateRevision {
if member, exist := tc.Status.PD.Members[podName]; !exist || !member.Health {
return controller.RequeueErrorf("tidbcluster: [%s/%s]'s pd upgraded pod: [%s] is not ready", ns, tcName, podName)
}
continue
}
return controller.RequeueErrorf("tidbcluster: [%s/%s]'s pd member: [%s] is transferring leader to pd member: [%s]", ns, tcName, upgradePodName, targetName)
} else {
setUpgradePartition(newSet, ordinal)

return pu.upgradePDPod(tc, i, newSet)
}

return nil
Expand Down Expand Up @@ -131,6 +124,29 @@ func (pu *pdUpgrader) needForceUpgrade(tc *v1alpha1.TidbCluster) (bool, error) {
return imagePullFailedCount >= int(tc.Status.PD.StatefulSet.Replicas)/2+1, nil
}

// upgradePDPod advances the upgrade to the PD pod with the given ordinal.
//
// When that pod is the PD leader recorded in tc.Status.PD.Leader and the
// StatefulSet has more than one replica, leadership is first transferred to
// another member: ordinal 0 if the pod being upgraded is the last ordinal,
// otherwise the last ordinal. In that case the function returns either the
// transfer error or the error built by controller.RequeueErrorf, and the
// partition is left untouched. In every other case it calls
// setUpgradePartition(newSet, ordinal) and returns nil.
func (pu *pdUpgrader) upgradePDPod(tc *v1alpha1.TidbCluster, ordinal int32, newSet *apps.StatefulSet) error {
	namespace, clusterName := tc.GetNamespace(), tc.GetName()
	podToUpgrade := pdPodName(clusterName, ordinal)

	// No leader hand-off needed: the pod is not the leader, or it is the only replica.
	if tc.Status.PD.Leader.Name != podToUpgrade || tc.Status.PD.StatefulSet.Replicas <= 1 {
		setUpgradePartition(newSet, ordinal)
		return nil
	}

	// Pick a transfer target that is guaranteed to differ from the pod being upgraded.
	lastOrdinal := tc.Status.PD.StatefulSet.Replicas - 1
	targetOrdinal := lastOrdinal
	if ordinal == lastOrdinal {
		targetOrdinal = 0
	}
	transferTarget := pdPodName(clusterName, targetOrdinal)

	if err := pu.transferPDLeaderTo(tc, transferTarget); err != nil {
		return err
	}
	return controller.RequeueErrorf("tidbcluster: [%s/%s]'s pd member: [%s] is transferring leader to pd member: [%s]", namespace, clusterName, podToUpgrade, transferTarget)
}

// transferPDLeaderTo asks the PD client obtained for tc to transfer PD
// leadership to the member named targetName and returns that call's error.
func (pu *pdUpgrader) transferPDLeaderTo(tc *v1alpha1.TidbCluster, targetName string) error {
	pdClient := pu.pdControl.GetPDClient(tc)
	return pdClient.TransferPDLeader(targetName)
}
Expand Down
31 changes: 27 additions & 4 deletions pkg/manager/member/pd_upgrader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ func TestPDUpgraderUpgrade(t *testing.T) {
name string
changeFn func(*v1alpha1.TidbCluster)
changePods func(pods []*corev1.Pod)
changeOldSet func(set *apps.StatefulSet)
transferLeaderErr bool
errExpectFn func(*GomegaWithT, error)
expectFn func(g *GomegaWithT, tc *v1alpha1.TidbCluster, newSet *apps.StatefulSet)
Expand Down Expand Up @@ -71,9 +72,13 @@ func TestPDUpgraderUpgrade(t *testing.T) {
podInformer.Informer().GetIndexer().Add(pods[i])
}

oldSet := newStatefulSetForPDUpgrader()
newSet := oldSet.DeepCopy()
newSet.Spec.Template.Spec.Containers[0].Image = "pd-test-images:v2"
newSet := newStatefulSetForPDUpgrader()
oldSet := newSet.DeepCopy()
if test.changeOldSet != nil {
test.changeOldSet(oldSet)
}
SetLastAppliedConfigAnnotation(oldSet)

newSet.Spec.UpdateStrategy.RollingUpdate.Partition = func() *int32 { i := int32(3); return &i }()

err := upgrader.Upgrade(tc, oldSet, newSet)
Expand All @@ -97,6 +102,24 @@ func TestPDUpgraderUpgrade(t *testing.T) {
g.Expect(newSet.Spec.UpdateStrategy.RollingUpdate.Partition).To(Equal(func() *int32 { i := int32(1); return &i }()))
},
},
{
name: "newSet template changed",
changeFn: func(tc *v1alpha1.TidbCluster) {
tc.Status.PD.Synced = true
},
changePods: nil,
changeOldSet: func(set *apps.StatefulSet) {
set.Spec.Template.Spec.Containers[0].Image = "pd-test-image:old"
},
transferLeaderErr: false,
errExpectFn: func(g *GomegaWithT, err error) {
g.Expect(err).NotTo(HaveOccurred())
},
expectFn: func(g *GomegaWithT, tc *v1alpha1.TidbCluster, newSet *apps.StatefulSet) {
g.Expect(tc.Status.PD.Phase).To(Equal(v1alpha1.UpgradePhase))
g.Expect(newSet.Spec.UpdateStrategy.RollingUpdate.Partition).To(Equal(func() *int32 { i := int32(3); return &i }()))
},
},
{
name: "skip to wait all members health",
changeFn: func(tc *v1alpha1.TidbCluster) {
Expand All @@ -106,7 +129,7 @@ func TestPDUpgraderUpgrade(t *testing.T) {
changePods: nil,
transferLeaderErr: false,
errExpectFn: func(g *GomegaWithT, err error) {
g.Expect(err.Error()).To(Equal("tidbcluster: [default/upgrader]'s pd upgraded pods are not all ready"))
g.Expect(err.Error()).To(Equal(fmt.Sprintf("tidbcluster: [default/upgrader]'s pd upgraded pod: [%s] is not ready", pdPodName(upgradeTcName, 2))))
},
expectFn: func(g *GomegaWithT, tc *v1alpha1.TidbCluster, newSet *apps.StatefulSet) {
g.Expect(tc.Status.PD.Phase).To(Equal(v1alpha1.UpgradePhase))
Expand Down
33 changes: 32 additions & 1 deletion pkg/manager/member/tidb_member_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ type tidbMemberManager struct {
tidbControl controller.TiDBControlInterface
setLister v1beta1.StatefulSetLister
svcLister corelisters.ServiceLister
podLister corelisters.PodLister
tidbUpgrader Upgrader
autoFailover bool
tidbFailover Failover
Expand All @@ -47,6 +48,7 @@ func NewTiDBMemberManager(setControl controller.StatefulSetControlInterface,
tidbControl controller.TiDBControlInterface,
setLister v1beta1.StatefulSetLister,
svcLister corelisters.ServiceLister,
podLister corelisters.PodLister,
tidbUpgrader Upgrader,
autoFailover bool,
tidbFailover Failover) manager.Manager {
Expand All @@ -56,6 +58,7 @@ func NewTiDBMemberManager(setControl controller.StatefulSetControlInterface,
tidbControl: tidbControl,
setLister: setLister,
svcLister: svcLister,
podLister: podLister,
tidbUpgrader: tidbUpgrader,
autoFailover: autoFailover,
tidbFailover: tidbFailover,
Expand Down Expand Up @@ -334,7 +337,11 @@ func (tmm *tidbMemberManager) getNewTiDBSetForTidbCluster(tc *v1alpha1.TidbClust
func (tmm *tidbMemberManager) syncTidbClusterStatus(tc *v1alpha1.TidbCluster, set *apps.StatefulSet) error {
tc.Status.TiDB.StatefulSet = &set.Status

if statefulSetIsUpgrading(set) {
upgrading, err := tmm.tidbStatefulSetIsUpgrading(set, tc)
if err != nil {
return err
}
if upgrading {
tc.Status.TiDB.Phase = v1alpha1.UpgradePhase
} else {
tc.Status.TiDB.Phase = v1alpha1.NormalPhase
Expand All @@ -360,3 +367,27 @@ func (tmm *tidbMemberManager) syncTidbClusterStatus(tc *v1alpha1.TidbCluster, se

return nil
}

// tidbStatefulSetIsUpgrading reports whether the TiDB StatefulSet should still
// be treated as upgrading.
//
// It returns true when statefulSetIsUpgrading(set) is true, or when any TiDB
// pod selected by the cluster's TiDB label selector carries a controller
// revision hash label that differs from
// tc.Status.TiDB.StatefulSet.UpdateRevision. Encountering a pod without that
// label stops the scan and yields false. Errors from building the selector or
// listing pods are returned unchanged.
func (tmm *tidbMemberManager) tidbStatefulSetIsUpgrading(set *apps.StatefulSet, tc *v1alpha1.TidbCluster) (bool, error) {
	if statefulSetIsUpgrading(set) {
		return true, nil
	}

	tidbSelector, err := label.New().Cluster(tc.GetName()).TiDB().Selector()
	if err != nil {
		return false, err
	}

	pods, err := tmm.podLister.Pods(tc.GetNamespace()).List(tidbSelector)
	if err != nil {
		return false, err
	}

	for i := range pods {
		podRevision, labeled := pods[i].Labels[apps.ControllerRevisionHashLabelKey]
		switch {
		case !labeled:
			// A pod without the revision label ends the check immediately.
			return false, nil
		case podRevision != tc.Status.TiDB.StatefulSet.UpdateRevision:
			// At least one pod is not yet on the update revision.
			return true, nil
		}
	}
	return false, nil
}
2 changes: 2 additions & 0 deletions pkg/manager/member/tidb_member_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ func newFakeTiDBMemberManager() (*tidbMemberManager, *controller.FakeStatefulSet
setInformer := kubeinformers.NewSharedInformerFactory(kubeCli, 0).Apps().V1beta1().StatefulSets()
tcInformer := informers.NewSharedInformerFactory(cli, 0).Pingcap().V1alpha1().TidbClusters()
svcInformer := kubeinformers.NewSharedInformerFactory(kubeCli, 0).Core().V1().Services()
podInformer := kubeinformers.NewSharedInformerFactory(kubeCli, 0).Core().V1().Pods()
setControl := controller.NewFakeStatefulSetControl(setInformer, tcInformer)
svcControl := controller.NewFakeServiceControl(svcInformer, tcInformer)
tidbUpgrader := NewFakeTiDBUpgrader()
Expand All @@ -206,6 +207,7 @@ func newFakeTiDBMemberManager() (*tidbMemberManager, *controller.FakeStatefulSet
tidbControl,
setInformer.Lister(),
svcInformer.Lister(),
podInformer.Lister(),
tidbUpgrader,
true,
tidbFailover,
Expand Down
Loading

0 comments on commit 71b0ecd

Please sign in to comment.