Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: can't recover after an upgrade failed #120

Merged
merged 28 commits into from
Oct 19, 2018
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
a7de64c
tidb graceful upgrade
xiaojingchen Sep 30, 2018
2cfbde5
Merge branch 'master' into tidb-graceful-upgrade
weekface Oct 11, 2018
2b78c7b
Merge branch 'master' into tidb-graceful-upgrade
xiaojingchen Oct 11, 2018
fc67d9a
Merge branch 'tidb-graceful-upgrade' of https://github.com/xiaojingch…
xiaojingchen Oct 11, 2018
19ac0fa
address comment
xiaojingchen Oct 11, 2018
16b03a3
address comment
xiaojingchen Oct 11, 2018
99f3a37
rewrite upgrade to fix some bugs
xiaojingchen Oct 12, 2018
bf8b5a4
Merge branch 'master' into fix-ugrade
xiaojingchen Oct 13, 2018
5781db6
Merge branch 'master' into fix-ugrade
gregwebs Oct 13, 2018
756691a
Merge branch 'master' into fix-ugrade
xiaojingchen Oct 14, 2018
645f067
Merge branch 'fix-ugrade' of https://github.com/xiaojingchen/tidb-ope…
xiaojingchen Oct 14, 2018
4cf1d11
fix bugs
xiaojingchen Oct 15, 2018
24524f2
address comment
xiaojingchen Oct 15, 2018
8d149b7
Merge branch 'master' into fix-ugrade
tennix Oct 15, 2018
19eddb5
fix upgrade state sync bug
xiaojingchen Oct 16, 2018
3afa97a
Merge branch 'fix-ugrade' of https://github.com/xiaojingchen/tidb-ope…
xiaojingchen Oct 16, 2018
3caece1
Merge branch 'fix-ugrade' of https://github.com/xiaojingchen/tidb-ope…
xiaojingchen Oct 16, 2018
bd16aaf
Merge branch 'fix-ugrade' of https://github.com/xiaojingchen/tidb-ope…
xiaojingchen Oct 16, 2018
2de52da
fix tikv upgrade bug
xiaojingchen Oct 17, 2018
f666f20
Merge branch 'master' into fix-ugrade
weekface Oct 18, 2018
5938988
fix empty point bug
xiaojingchen Oct 18, 2018
7cb3f53
Merge branch 'fix-ugrade' of https://github.com/xiaojingchen/tidb-ope…
xiaojingchen Oct 18, 2018
5429058
address comment
xiaojingchen Oct 18, 2018
8ea29e7
address comment
xiaojingchen Oct 18, 2018
7239470
address comment
xiaojingchen Oct 18, 2018
e0b1163
Merge branch 'master' into fix-ugrade
xiaojingchen Oct 18, 2018
63d333f
Merge branch 'master' into fix-ugrade
tennix Oct 19, 2018
8b29919
Merge branch 'master' into fix-ugrade
weekface Oct 19, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pkg/controller/tidbcluster/tidb_cluster_control_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ func newFakeTidbClusterControl() (ControlInterface, *controller.FakeStatefulSetC

pdMemberManager := mm.NewPDMemberManager(pdControl, setControl, svcControl, setInformer.Lister(), svcInformer.Lister(), podInformer.Lister(), podControl, pvcInformer.Lister(), pdScaler, pdUpgrader, autoFailover, pdFailover)
tikvMemberManager := mm.NewTiKVMemberManager(pdControl, setControl, svcControl, setInformer.Lister(), svcInformer.Lister(), podInformer.Lister(), nodeInformer.Lister(), autoFailover, tikvFailover, tikvScaler, tikvUpgrader)
tidbMemberManager := mm.NewTiDBMemberManager(setControl, svcControl, tidbControl, setInformer.Lister(), svcInformer.Lister(), tidbUpgrader, autoFailover, tidbFailover)
tidbMemberManager := mm.NewTiDBMemberManager(setControl, svcControl, tidbControl, setInformer.Lister(), svcInformer.Lister(), podInformer.Lister(), tidbUpgrader, autoFailover, tidbFailover)
reclaimPolicyManager := meta.NewReclaimPolicyManager(pvcInformer.Lister(), pvInformer.Lister(), pvControl)
metaManager := meta.NewMetaManager(pvcInformer.Lister(), pvcControl, pvInformer.Lister(), pvControl, podInformer.Lister(), podControl)
control := NewDefaultTidbClusterControl(tcControl, pdMemberManager, tikvMemberManager, tidbMemberManager, reclaimPolicyManager, metaManager, recorder)
Expand Down
3 changes: 2 additions & 1 deletion pkg/controller/tidbcluster/tidb_cluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ func NewController(
tidbFailover := mm.NewTiDBFailover(tidbFailoverPeriod)
pdUpgrader := mm.NewPDUpgrader(pdControl, podControl, podInformer.Lister())
tikvUpgrader := mm.NewTiKVUpgrader(pdControl, podControl, podInformer.Lister())
tidbUpgrader := mm.NewTiDBUpgrader(tidbControl)
tidbUpgrader := mm.NewTiDBUpgrader(tidbControl, podInformer.Lister())

tcc := &Controller{
kubeClient: kubeCli,
Expand Down Expand Up @@ -144,6 +144,7 @@ func NewController(
tidbControl,
setInformer.Lister(),
svcInformer.Lister(),
podInformer.Lister(),
tidbUpgrader,
autoFailover,
tidbFailover,
Expand Down
1 change: 1 addition & 0 deletions pkg/controller/tidbcluster/tidb_cluster_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,7 @@ func newFakeTidbClusterController() (*Controller, cache.Indexer, cache.Indexer)
tidbControl,
setInformer.Lister(),
svcInformer.Lister(),
podInformer.Lister(),
tidbUpgrader,
autoFailover,
tidbFailover,
Expand Down
30 changes: 29 additions & 1 deletion pkg/manager/member/pd_member_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,11 @@ func (pmm *pdMemberManager) syncTidbClusterStatus(tc *v1alpha1.TidbCluster, set

tc.Status.PD.StatefulSet = &set.Status

if statefulSetIsUpgrading(set) {
upgrading, err := pmm.pdStatefulSetIsUpgrading(set, tc)
if err != nil {
return err
}
if upgrading {
tc.Status.PD.Phase = v1alpha1.UpgradePhase
} else {
tc.Status.PD.Phase = v1alpha1.NormalPhase
Expand Down Expand Up @@ -408,6 +412,30 @@ func (pmm *pdMemberManager) getNewPDHeadlessServiceForTidbCluster(tc *v1alpha1.T
}
}

// pdStatefulSetIsUpgrading reports whether the PD StatefulSet should be
// treated as upgrading.
//
// It returns true as soon as statefulSetIsUpgrading(set) says so. Otherwise
// it lists the PD pods of the cluster and walks them in order: a pod whose
// controller-revision-hash label differs from the update revision recorded
// in tc.Status.PD.StatefulSet yields true, while a pod that carries no such
// label ends the scan with false. Selector-construction and pod-listing
// errors are returned unchanged.
func (pmm *pdMemberManager) pdStatefulSetIsUpgrading(set *apps.StatefulSet, tc *v1alpha1.TidbCluster) (bool, error) {
	if statefulSetIsUpgrading(set) {
		return true, nil
	}

	pdSelector, err := label.New().Cluster(tc.GetName()).PD().Selector()
	if err != nil {
		return false, err
	}
	pods, err := pmm.podLister.Pods(tc.GetNamespace()).List(pdSelector)
	if err != nil {
		return false, err
	}

	for i := range pods {
		hash, labelled := pods[i].Labels[apps.ControllerRevisionHashLabelKey]
		switch {
		case !labelled:
			// A pod without the revision label stops the scan.
			return false, nil
		case hash != tc.Status.PD.StatefulSet.UpdateRevision:
			return true, nil
		}
	}
	return false, nil
}

func (pmm *pdMemberManager) getNewPDSetForTidbCluster(tc *v1alpha1.TidbCluster) (*apps.StatefulSet, error) {
ns := tc.Namespace
tcName := tc.Name
Expand Down
92 changes: 66 additions & 26 deletions pkg/manager/member/pd_upgrader.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,38 +66,31 @@ func (pu *pdUpgrader) gracefulUpgrade(tc *v1alpha1.TidbCluster, oldSet *apps.Sta
}

tc.Status.PD.Phase = v1alpha1.UpgradePhase
setUpgradePartition(newSet, *oldSet.Spec.UpdateStrategy.RollingUpdate.Partition)

if tc.Status.PD.StatefulSet.CurrentReplicas == 0 {
return controller.RequeueErrorf("tidbcluster: [%s/%s]'s pd doesn't have old version pod to upgrade", ns, tcName)
}

if !tc.PDAllPodsStarted() {
return controller.RequeueErrorf("tidbcluster: [%s/%s]'s pd pods are not all created", ns, tcName)
if !templateEqual(newSet.Spec.Template, oldSet.Spec.Template) {
return nil
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Set Partition to the max

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And add a comment explaining why this returns nil.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the Partition default is max

}

for i := tc.Status.PD.StatefulSet.Replicas; i > tc.Status.PD.StatefulSet.CurrentReplicas; i-- {
if member, exist := tc.Status.PD.Members[pdPodName(tcName, i-1)]; !exist || !member.Health {
return controller.RequeueErrorf("tidbcluster: [%s/%s]'s pd upgraded pods are not all ready", ns, tcName)
setUpgradePartition(newSet, *oldSet.Spec.UpdateStrategy.RollingUpdate.Partition)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for i := tc.Status.PD.StatefulSet.Replicas - 1; i >= 0; i-- {
podName := pdPodName(tcName, i)
pod, err := pu.podLister.Pods(ns).Get(podName)
if err != nil {
return err
}
}

ordinal := tc.Status.PD.StatefulSet.CurrentReplicas - 1
upgradePodName := pdPodName(tcName, ordinal)
if tc.Status.PD.Leader.Name == upgradePodName {
var targetName string
if ordinal == *newSet.Spec.Replicas-1 {
targetName = pdPodName(tcName, 0)
} else {
targetName = pdPodName(tcName, *newSet.Spec.Replicas-1)
revision, exist := pod.Labels[apps.ControllerRevisionHashLabelKey]
if !exist {
return controller.RequeueErrorf("tidbcluster: [%s/%s]'s pd pod: [%s] have not label: %s", ns, tcName, podName, apps.ControllerRevisionHashLabelKey)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

s/have not/has no/

}
err := pu.transferPDLeaderTo(tc, targetName)
if err != nil {
return err

if revision == tc.Status.PD.StatefulSet.UpdateRevision {
if member, exist := tc.Status.PD.Members[podName]; !exist || !member.Health {
return controller.RequeueErrorf("tidbcluster: [%s/%s]'s pd upgraded pod: [%s] are not ready", ns, tcName, podName)
}
continue
}
return controller.RequeueErrorf("tidbcluster: [%s/%s]'s pd member: [%s] is transferring leader to pd member: [%s]", ns, tcName, upgradePodName, targetName)
} else {
setUpgradePartition(newSet, ordinal)

return pu.upgradePDPod(tc, i, newSet)
}

return nil
Expand Down Expand Up @@ -131,6 +124,53 @@ func (pu *pdUpgrader) needForceUpgrade(tc *v1alpha1.TidbCluster) (bool, error) {
return imagePullFailedCount >= int(tc.Status.PD.StatefulSet.Replicas)/2+1, nil
}

// upgradePDPod handles the upgrade step for the PD pod at the given ordinal.
//
// When that pod is not the current PD leader, the ordinal is recorded on
// newSet through setUpgradePartition and nil is returned. When it is the
// leader, leadership is first transferred away: to the pod at ordinal 0 if
// the pod being upgraded is the last one, otherwise to the last pod. A
// failed transfer returns its error; a successful one returns a requeue
// error and leaves newSet untouched.
func (pu *pdUpgrader) upgradePDPod(tc *v1alpha1.TidbCluster, ordinal int32, newSet *apps.StatefulSet) error {
	namespace, clusterName := tc.GetNamespace(), tc.GetName()
	podName := pdPodName(clusterName, ordinal)

	if tc.Status.PD.Leader.Name != podName {
		setUpgradePartition(newSet, ordinal)
		return nil
	}

	// Pick a transfer target that is not the pod being upgraded.
	targetOrdinal := tc.Status.PD.StatefulSet.Replicas - 1
	if ordinal == targetOrdinal {
		targetOrdinal = 0
	}
	targetName := pdPodName(clusterName, targetOrdinal)

	if err := pu.transferPDLeaderTo(tc, targetName); err != nil {
		return err
	}
	return controller.RequeueErrorf("tidbcluster: [%s/%s]'s pd member: [%s] is transferring leader to pd member: [%s]", namespace, clusterName, podName, targetName)
}

func (pu *pdUpgrader) pdStatefulSetIsUpgrading(set *apps.StatefulSet, tc *v1alpha1.TidbCluster) (bool, error) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This method is unused and duplicates the method in pd_member_manager.go.

if statefulSetIsUpgrading(set) {
return true, nil
}
selector, err := label.New().Cluster(tc.GetName()).PD().Selector()
if err != nil {
return false, err
}
pdPods, err := pu.podLister.Pods(tc.GetNamespace()).List(selector)
if err != nil {
return false, err
}
for _, pod := range pdPods {
revisionHash, exist := pod.Labels[apps.ControllerRevisionHashLabelKey]
if !exist {
return false, nil
}
if revisionHash != tc.Status.PD.StatefulSet.UpdateRevision {
return true, nil
}
}
return false, nil
}

// transferPDLeaderTo obtains the PD client for tc from pdControl and asks it
// to transfer PD leadership to the member named targetName. The client's
// result is returned as-is.
func (pu *pdUpgrader) transferPDLeaderTo(tc *v1alpha1.TidbCluster, targetName string) error {
	pdClient := pu.pdControl.GetPDClient(tc)
	return pdClient.TransferPDLeader(targetName)
}
Expand Down
31 changes: 27 additions & 4 deletions pkg/manager/member/pd_upgrader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ func TestPDUpgraderUpgrade(t *testing.T) {
name string
changeFn func(*v1alpha1.TidbCluster)
changePods func(pods []*corev1.Pod)
changeOldSet func(set *apps.StatefulSet)
transferLeaderErr bool
errExpectFn func(*GomegaWithT, error)
expectFn func(g *GomegaWithT, tc *v1alpha1.TidbCluster, newSet *apps.StatefulSet)
Expand Down Expand Up @@ -71,9 +72,13 @@ func TestPDUpgraderUpgrade(t *testing.T) {
podInformer.Informer().GetIndexer().Add(pods[i])
}

oldSet := newStatefulSetForPDUpgrader()
newSet := oldSet.DeepCopy()
newSet.Spec.Template.Spec.Containers[0].Image = "pd-test-images:v2"
newSet := newStatefulSetForPDUpgrader()
oldSet := newSet.DeepCopy()
if test.changeOldSet != nil {
test.changeOldSet(oldSet)
}
SetLastAppliedConfigAnnotation(oldSet)

newSet.Spec.UpdateStrategy.RollingUpdate.Partition = func() *int32 { i := int32(3); return &i }()

err := upgrader.Upgrade(tc, oldSet, newSet)
Expand All @@ -97,6 +102,24 @@ func TestPDUpgraderUpgrade(t *testing.T) {
g.Expect(newSet.Spec.UpdateStrategy.RollingUpdate.Partition).To(Equal(func() *int32 { i := int32(1); return &i }()))
},
},
{
name: "newSet template changed",
changeFn: func(tc *v1alpha1.TidbCluster) {
tc.Status.PD.Synced = true
},
changePods: nil,
changeOldSet: func(set *apps.StatefulSet) {
set.Spec.Template.Spec.Containers[0].Image = "pd-test-image:old"
},
transferLeaderErr: false,
errExpectFn: func(g *GomegaWithT, err error) {
g.Expect(err).NotTo(HaveOccurred())
},
expectFn: func(g *GomegaWithT, tc *v1alpha1.TidbCluster, newSet *apps.StatefulSet) {
g.Expect(tc.Status.PD.Phase).To(Equal(v1alpha1.UpgradePhase))
g.Expect(newSet.Spec.UpdateStrategy.RollingUpdate.Partition).To(Equal(func() *int32 { i := int32(3); return &i }()))
},
},
{
name: "skip to wait all members health",
changeFn: func(tc *v1alpha1.TidbCluster) {
Expand All @@ -106,7 +129,7 @@ func TestPDUpgraderUpgrade(t *testing.T) {
changePods: nil,
transferLeaderErr: false,
errExpectFn: func(g *GomegaWithT, err error) {
g.Expect(err.Error()).To(Equal("tidbcluster: [default/upgrader]'s pd upgraded pods are not all ready"))
g.Expect(err.Error()).To(Equal(fmt.Sprintf("tidbcluster: [default/upgrader]'s pd upgraded pod: [%s] are not ready", pdPodName(upgradeTcName, 2))))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

s/are/is/

},
expectFn: func(g *GomegaWithT, tc *v1alpha1.TidbCluster, newSet *apps.StatefulSet) {
g.Expect(tc.Status.PD.Phase).To(Equal(v1alpha1.UpgradePhase))
Expand Down
33 changes: 32 additions & 1 deletion pkg/manager/member/tidb_member_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ type tidbMemberManager struct {
tidbControl controller.TiDBControlInterface
setLister v1beta1.StatefulSetLister
svcLister corelisters.ServiceLister
podLister corelisters.PodLister
tidbUpgrader Upgrader
autoFailover bool
tidbFailover Failover
Expand All @@ -47,6 +48,7 @@ func NewTiDBMemberManager(setControl controller.StatefulSetControlInterface,
tidbControl controller.TiDBControlInterface,
setLister v1beta1.StatefulSetLister,
svcLister corelisters.ServiceLister,
podLister corelisters.PodLister,
tidbUpgrader Upgrader,
autoFailover bool,
tidbFailover Failover) manager.Manager {
Expand All @@ -56,6 +58,7 @@ func NewTiDBMemberManager(setControl controller.StatefulSetControlInterface,
tidbControl: tidbControl,
setLister: setLister,
svcLister: svcLister,
podLister: podLister,
tidbUpgrader: tidbUpgrader,
autoFailover: autoFailover,
tidbFailover: tidbFailover,
Expand Down Expand Up @@ -334,7 +337,11 @@ func (tmm *tidbMemberManager) getNewTiDBSetForTidbCluster(tc *v1alpha1.TidbClust
func (tmm *tidbMemberManager) syncTidbClusterStatus(tc *v1alpha1.TidbCluster, set *apps.StatefulSet) error {
tc.Status.TiDB.StatefulSet = &set.Status

if statefulSetIsUpgrading(set) {
upgrading, err := tmm.tidbStatefulSetIsUpgrading(set, tc)
if err != nil {
return err
}
if upgrading {
tc.Status.TiDB.Phase = v1alpha1.UpgradePhase
} else {
tc.Status.TiDB.Phase = v1alpha1.NormalPhase
Expand All @@ -360,3 +367,27 @@ func (tmm *tidbMemberManager) syncTidbClusterStatus(tc *v1alpha1.TidbCluster, se

return nil
}

// tidbStatefulSetIsUpgrading reports whether the TiDB StatefulSet should be
// treated as upgrading.
//
// A true result from statefulSetIsUpgrading(set) is returned immediately.
// Otherwise the cluster's TiDB pods are listed and checked in order: the
// first pod whose controller-revision-hash label differs from
// tc.Status.TiDB.StatefulSet.UpdateRevision yields true, and the first pod
// lacking that label yields false. Errors from building the selector or
// listing pods are passed back to the caller.
func (tmm *tidbMemberManager) tidbStatefulSetIsUpgrading(set *apps.StatefulSet, tc *v1alpha1.TidbCluster) (bool, error) {
	if statefulSetIsUpgrading(set) {
		return true, nil
	}

	tidbSelector, err := label.New().Cluster(tc.GetName()).TiDB().Selector()
	if err != nil {
		return false, err
	}

	pods, err := tmm.podLister.Pods(tc.GetNamespace()).List(tidbSelector)
	if err != nil {
		return false, err
	}

	for _, p := range pods {
		if hash, found := p.Labels[apps.ControllerRevisionHashLabelKey]; !found {
			// Unlabelled pod: stop looking and report "not upgrading".
			return false, nil
		} else if hash != tc.Status.TiDB.StatefulSet.UpdateRevision {
			return true, nil
		}
	}
	return false, nil
}
2 changes: 2 additions & 0 deletions pkg/manager/member/tidb_member_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ func newFakeTiDBMemberManager() (*tidbMemberManager, *controller.FakeStatefulSet
setInformer := kubeinformers.NewSharedInformerFactory(kubeCli, 0).Apps().V1beta1().StatefulSets()
tcInformer := informers.NewSharedInformerFactory(cli, 0).Pingcap().V1alpha1().TidbClusters()
svcInformer := kubeinformers.NewSharedInformerFactory(kubeCli, 0).Core().V1().Services()
podInformer := kubeinformers.NewSharedInformerFactory(kubeCli, 0).Core().V1().Pods()
setControl := controller.NewFakeStatefulSetControl(setInformer, tcInformer)
svcControl := controller.NewFakeServiceControl(svcInformer, tcInformer)
tidbUpgrader := NewFakeTiDBUpgrader()
Expand All @@ -206,6 +207,7 @@ func newFakeTiDBMemberManager() (*tidbMemberManager, *controller.FakeStatefulSet
tidbControl,
setInformer.Lister(),
svcInformer.Lister(),
podInformer.Lister(),
tidbUpgrader,
true,
tidbFailover,
Expand Down
Loading