-
Notifications
You must be signed in to change notification settings - Fork 502
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix: can't recover after an upgrade failed #120
Changes from 18 commits
a7de64c
2cfbde5
2b78c7b
fc67d9a
19ac0fa
16b03a3
99f3a37
bf8b5a4
5781db6
756691a
645f067
4cf1d11
24524f2
8d149b7
19eddb5
3afa97a
3caece1
bd16aaf
2de52da
f666f20
5938988
7cb3f53
5429058
8ea29e7
7239470
e0b1163
63d333f
8b29919
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -66,38 +66,31 @@ func (pu *pdUpgrader) gracefulUpgrade(tc *v1alpha1.TidbCluster, oldSet *apps.Sta | |
} | ||
|
||
tc.Status.PD.Phase = v1alpha1.UpgradePhase | ||
setUpgradePartition(newSet, *oldSet.Spec.UpdateStrategy.RollingUpdate.Partition) | ||
|
||
if tc.Status.PD.StatefulSet.CurrentReplicas == 0 { | ||
return controller.RequeueErrorf("tidbcluster: [%s/%s]'s pd doesn't have old version pod to upgrade", ns, tcName) | ||
} | ||
|
||
if !tc.PDAllPodsStarted() { | ||
return controller.RequeueErrorf("tidbcluster: [%s/%s]'s pd pods are not all created", ns, tcName) | ||
if !templateEqual(newSet.Spec.Template, oldSet.Spec.Template) { | ||
return nil | ||
} | ||
|
||
for i := tc.Status.PD.StatefulSet.Replicas; i > tc.Status.PD.StatefulSet.CurrentReplicas; i-- { | ||
if member, exist := tc.Status.PD.Members[pdPodName(tcName, i-1)]; !exist || !member.Health { | ||
return controller.RequeueErrorf("tidbcluster: [%s/%s]'s pd upgraded pods are not all ready", ns, tcName) | ||
setUpgradePartition(newSet, *oldSet.Spec.UpdateStrategy.RollingUpdate.Partition) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This line should be added to the |
||
for i := tc.Status.PD.StatefulSet.Replicas - 1; i >= 0; i-- { | ||
podName := pdPodName(tcName, i) | ||
pod, err := pu.podLister.Pods(ns).Get(podName) | ||
if err != nil { | ||
return err | ||
} | ||
} | ||
|
||
ordinal := tc.Status.PD.StatefulSet.CurrentReplicas - 1 | ||
upgradePodName := pdPodName(tcName, ordinal) | ||
if tc.Status.PD.Leader.Name == upgradePodName { | ||
var targetName string | ||
if ordinal == *newSet.Spec.Replicas-1 { | ||
targetName = pdPodName(tcName, 0) | ||
} else { | ||
targetName = pdPodName(tcName, *newSet.Spec.Replicas-1) | ||
revision, exist := pod.Labels[apps.ControllerRevisionHashLabelKey] | ||
if !exist { | ||
return controller.RequeueErrorf("tidbcluster: [%s/%s]'s pd pod: [%s] have not label: %s", ns, tcName, podName, apps.ControllerRevisionHashLabelKey) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. s/have not/has no/ |
||
} | ||
err := pu.transferPDLeaderTo(tc, targetName) | ||
if err != nil { | ||
return err | ||
|
||
if revision == tc.Status.PD.StatefulSet.UpdateRevision { | ||
if member, exist := tc.Status.PD.Members[podName]; !exist || !member.Health { | ||
return controller.RequeueErrorf("tidbcluster: [%s/%s]'s pd upgraded pod: [%s] are not ready", ns, tcName, podName) | ||
} | ||
continue | ||
} | ||
return controller.RequeueErrorf("tidbcluster: [%s/%s]'s pd member: [%s] is transferring leader to pd member: [%s]", ns, tcName, upgradePodName, targetName) | ||
} else { | ||
setUpgradePartition(newSet, ordinal) | ||
|
||
return pu.upgradePDPod(tc, i, newSet) | ||
} | ||
|
||
return nil | ||
|
@@ -131,6 +124,53 @@ func (pu *pdUpgrader) needForceUpgrade(tc *v1alpha1.TidbCluster) (bool, error) { | |
return imagePullFailedCount >= int(tc.Status.PD.StatefulSet.Replicas)/2+1, nil | ||
} | ||
|
||
func (pu *pdUpgrader) upgradePDPod(tc *v1alpha1.TidbCluster, ordinal int32, newSet *apps.StatefulSet) error { | ||
ns := tc.GetNamespace() | ||
tcName := tc.GetName() | ||
upgradePodName := pdPodName(tcName, ordinal) | ||
if tc.Status.PD.Leader.Name == upgradePodName { | ||
lastOrdinal := tc.Status.PD.StatefulSet.Replicas - 1 | ||
var targetName string | ||
if ordinal == lastOrdinal { | ||
targetName = pdPodName(tcName, 0) | ||
} else { | ||
targetName = pdPodName(tcName, lastOrdinal) | ||
} | ||
err := pu.transferPDLeaderTo(tc, targetName) | ||
if err != nil { | ||
return err | ||
} | ||
return controller.RequeueErrorf("tidbcluster: [%s/%s]'s pd member: [%s] is transferring leader to pd member: [%s]", ns, tcName, upgradePodName, targetName) | ||
} | ||
|
||
setUpgradePartition(newSet, ordinal) | ||
return nil | ||
} | ||
|
||
func (pu *pdUpgrader) pdStatefulSetIsUpgrading(set *apps.StatefulSet, tc *v1alpha1.TidbCluster) (bool, error) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This method is unused and duplicates the method in |
||
if statefulSetIsUpgrading(set) { | ||
return true, nil | ||
} | ||
selector, err := label.New().Cluster(tc.GetName()).PD().Selector() | ||
if err != nil { | ||
return false, err | ||
} | ||
pdPods, err := pu.podLister.Pods(tc.GetNamespace()).List(selector) | ||
if err != nil { | ||
return false, err | ||
} | ||
for _, pod := range pdPods { | ||
revisionHash, exist := pod.Labels[apps.ControllerRevisionHashLabelKey] | ||
if !exist { | ||
return false, nil | ||
} | ||
if revisionHash != tc.Status.PD.StatefulSet.UpdateRevision { | ||
return true, nil | ||
} | ||
} | ||
return false, nil | ||
} | ||
|
||
func (pu *pdUpgrader) transferPDLeaderTo(tc *v1alpha1.TidbCluster, targetName string) error { | ||
return pu.pdControl.GetPDClient(tc).TransferPDLeader(targetName) | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,6 +37,7 @@ func TestPDUpgraderUpgrade(t *testing.T) { | |
name string | ||
changeFn func(*v1alpha1.TidbCluster) | ||
changePods func(pods []*corev1.Pod) | ||
changeOldSet func(set *apps.StatefulSet) | ||
transferLeaderErr bool | ||
errExpectFn func(*GomegaWithT, error) | ||
expectFn func(g *GomegaWithT, tc *v1alpha1.TidbCluster, newSet *apps.StatefulSet) | ||
|
@@ -71,9 +72,13 @@ func TestPDUpgraderUpgrade(t *testing.T) { | |
podInformer.Informer().GetIndexer().Add(pods[i]) | ||
} | ||
|
||
oldSet := newStatefulSetForPDUpgrader() | ||
newSet := oldSet.DeepCopy() | ||
newSet.Spec.Template.Spec.Containers[0].Image = "pd-test-images:v2" | ||
newSet := newStatefulSetForPDUpgrader() | ||
oldSet := newSet.DeepCopy() | ||
if test.changeOldSet != nil { | ||
test.changeOldSet(oldSet) | ||
} | ||
SetLastAppliedConfigAnnotation(oldSet) | ||
|
||
newSet.Spec.UpdateStrategy.RollingUpdate.Partition = func() *int32 { i := int32(3); return &i }() | ||
|
||
err := upgrader.Upgrade(tc, oldSet, newSet) | ||
|
@@ -97,6 +102,24 @@ func TestPDUpgraderUpgrade(t *testing.T) { | |
g.Expect(newSet.Spec.UpdateStrategy.RollingUpdate.Partition).To(Equal(func() *int32 { i := int32(1); return &i }())) | ||
}, | ||
}, | ||
{ | ||
name: "newSet template changed", | ||
changeFn: func(tc *v1alpha1.TidbCluster) { | ||
tc.Status.PD.Synced = true | ||
}, | ||
changePods: nil, | ||
changeOldSet: func(set *apps.StatefulSet) { | ||
set.Spec.Template.Spec.Containers[0].Image = "pd-test-image:old" | ||
}, | ||
transferLeaderErr: false, | ||
errExpectFn: func(g *GomegaWithT, err error) { | ||
g.Expect(err).NotTo(HaveOccurred()) | ||
}, | ||
expectFn: func(g *GomegaWithT, tc *v1alpha1.TidbCluster, newSet *apps.StatefulSet) { | ||
g.Expect(tc.Status.PD.Phase).To(Equal(v1alpha1.UpgradePhase)) | ||
g.Expect(newSet.Spec.UpdateStrategy.RollingUpdate.Partition).To(Equal(func() *int32 { i := int32(3); return &i }())) | ||
}, | ||
}, | ||
{ | ||
name: "skip to wait all members health", | ||
changeFn: func(tc *v1alpha1.TidbCluster) { | ||
|
@@ -106,7 +129,7 @@ func TestPDUpgraderUpgrade(t *testing.T) { | |
changePods: nil, | ||
transferLeaderErr: false, | ||
errExpectFn: func(g *GomegaWithT, err error) { | ||
g.Expect(err.Error()).To(Equal("tidbcluster: [default/upgrader]'s pd upgraded pods are not all ready")) | ||
g.Expect(err.Error()).To(Equal(fmt.Sprintf("tidbcluster: [default/upgrader]'s pd upgraded pod: [%s] are not ready", pdPodName(upgradeTcName, 2)))) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. s/are/is/ |
||
}, | ||
expectFn: func(g *GomegaWithT, tc *v1alpha1.TidbCluster, newSet *apps.StatefulSet) { | ||
g.Expect(tc.Status.PD.Phase).To(Equal(v1alpha1.UpgradePhase)) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Set `Partition` to the max.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
And add a comment to indicate the reason why `return nil`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `Partition` default is max.