Merge remote-tracking branch 'main/master' into local-ssd-provision
gregwebs committed Jun 13, 2019
2 parents da1107e + 1090e7a commit 44afaab
Showing 17 changed files with 252 additions and 395 deletions.
4 changes: 2 additions & 2 deletions deploy/aliyun/templates/local-volume-provisioner.yaml.tpl
@@ -72,10 +72,10 @@ spec:
           resources:
             requests:
               cpu: 100m
-              memory: 100m
+              memory: 100Mi
             limits:
               cpu: 100m
-              memory: 100m
+              memory: 100Mi
           volumeMounts:
           - mountPath: /etc/provisioner/config
             name: provisioner-config
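Why the one-character suffix change above matters: in a Kubernetes resource quantity the `m` suffix means milli, so `memory: 100m` requests a tenth of a byte, while `100Mi` is 100 mebibytes. A minimal sketch using `k8s.io/apimachinery` (not part of this diff) that shows the difference:

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// "100m" means 100 milli-units. That is sensible for CPU
	// (0.1 core), but as a memory quantity it is a tenth of a byte.
	typo := resource.MustParse("100m")
	fmt.Println(typo.MilliValue()) // 100 (milli-bytes!)

	// "100Mi" is 100 mebibytes, the limit this commit intends.
	fixed := resource.MustParse("100Mi")
	fmt.Println(fixed.Value()) // 104857600
}
```

The same fix is applied to the AWS and local-dind manifests below.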
4 changes: 2 additions & 2 deletions deploy/aws/manifests/local-volume-provisioner.yaml
@@ -63,10 +63,10 @@ spec:
           resources:
             requests:
               cpu: 100m
-              memory: 100m
+              memory: 100Mi
             limits:
               cpu: 100m
-              memory: 100m
+              memory: 100Mi
           volumeMounts:
           - mountPath: /etc/provisioner/config
             name: provisioner-config
4 changes: 2 additions & 2 deletions manifests/local-dind/local-volume-provisioner.yaml
@@ -56,10 +56,10 @@ spec:
           resources:
             requests:
               cpu: 100m
-              memory: 100m
+              memory: 100Mi
             limits:
               cpu: 100m
-              memory: 100m
+              memory: 100Mi
           volumeMounts:
           - mountPath: /etc/provisioner/config
             name: provisioner-config
11 changes: 5 additions & 6 deletions tests/actions.go
@@ -104,7 +104,6 @@ const (
     tidbClusterChartName = "tidb-cluster"
     backupChartName      = "tidb-backup"
     statbilityTestTag    = "stability"
-    metricsPort          = 8090
 )

 type OperatorActions interface {
@@ -157,6 +156,7 @@ type OperatorActions interface {
     CheckTidbClustersAvailableOrDie(infos []*TidbClusterConfig)
     CheckOneEtcdDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string)
     CheckOneApiserverDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string)
+    CheckKubeProxyDownOrDie(clusters []*TidbClusterConfig)
     RegisterWebHookAndService(context *apimachinery.CertContext, info *OperatorConfig) error
     RegisterWebHookAndServiceOrDie(context *apimachinery.CertContext, info *OperatorConfig)
     CleanWebHookAndService(info *OperatorConfig) error
@@ -1540,7 +1540,7 @@ func (oa *operatorActions) checkGrafanaData(clusterInfo *TidbClusterConfig) error
     // Grafana ready, init grafana client, no more sync logic because race condition is okay here
     if clusterInfo.GrafanaClient == nil {
         grafanaURL := fmt.Sprintf("http://%s.%s:3000", svcName, ns)
-        client, err := metrics.NewClient(grafanaURL, grafanaUsername, grafanaPassword, metricsPort)
+        client, err := metrics.NewClient(grafanaURL, grafanaUsername, grafanaPassword)
         if err != nil {
             return err
         }
@@ -2342,11 +2342,10 @@ func (oa *operatorActions) EventWorker() {
         ns := clusterEv.ns
         clusterName := clusterEv.clusterName
         grafanaURL := fmt.Sprintf("http://%s-grafana.%s:3000", clusterName, ns)
-        client, err := metrics.NewClient(grafanaURL, grafanaUsername, grafanaPassword, metricsPort)
+        client, err := metrics.NewClient(grafanaURL, grafanaUsername, grafanaPassword)
         if err != nil {
-            retryEvents = append(retryEvents, ev)
-            glog.V(4).Infof("failed to new grafana client: [%s/%s], %v", ns, clusterName, err)
-            continue
+            // A failure to parse the grafana URL cannot be recovered by retrying, so report it and panic
+            slack.NotifyAndPanic(fmt.Errorf("failed to parse grafana URL so can't new grafana client: %s, %v", grafanaURL, err))
         }

         anno := metrics.Annotation{
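The EventWorker change swaps a retry for a panic because, once the metrics port argument is gone, the constructor's only plausible failure is an unparsable Grafana URL, and retrying the same URL can never succeed. A standalone sketch of that reasoning (hypothetical `newGrafanaClient`, standing in for the repo's `metrics.NewClient`):

```go
package main

import (
	"fmt"
	"net/url"
)

// newGrafanaClient fails only on URL parsing, which is a permanent error:
// identical input will fail forever, so the caller should give up loudly
// rather than queue the event for retry.
func newGrafanaClient(rawURL, user, password string) (*url.URL, error) {
	u, err := url.Parse(rawURL)
	if err != nil {
		return nil, fmt.Errorf("failed to parse grafana URL %s: %v", rawURL, err)
	}
	u.User = url.UserPassword(user, password)
	return u, nil
}

func main() {
	if _, err := newGrafanaClient("http://demo-grafana.ns:3000", "admin", "admin"); err != nil {
		panic(err) // the test code uses slack.NotifyAndPanic instead
	}
	fmt.Println("client constructed")
}
```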
21 changes: 3 additions & 18 deletions tests/cmd/fault-trigger/main.go
@@ -20,8 +20,6 @@ import (
     _ "net/http/pprof"
     "time"

-    "github.com/pingcap/tidb-operator/tests/pkg/util"
-
     "github.com/golang/glog"
     "github.com/pingcap/tidb-operator/tests/pkg/fault-trigger/api"
     "github.com/pingcap/tidb-operator/tests/pkg/fault-trigger/manager"
@@ -30,17 +28,13 @@
 )

 var (
-    port             int
-    pprofPort        int
-    kubeProxyImage   string
-    hostnameOverride string
+    port      int
+    pprofPort int
 )

 func init() {
     flag.IntVar(&port, "port", 23332, "The port that the fault trigger's http service runs on (default 23332)")
     flag.IntVar(&pprofPort, "pprof-port", 6060, "The port that the pprof's http service runs on (default 6060)")
-    flag.StringVar(&kubeProxyImage, "kube-proxy-image", "k8s.gcr.io/kube-proxy:v1.12.2", "The kube proxy image (default k8s.gcr.io/kube-proxy:v1.12.2)")
-    flag.StringVar(&hostnameOverride, "hostname-override", "", "If non-empty, will use this string as identification instead of the actual hostname")

     flag.Parse()
 }
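The two deleted flags track the behavioral change in `main` below: kube-proxy is presumably no longer patched once at fault-trigger startup but manipulated on demand through the manager, which is what lets the stability test drive `StopKubeProxyOrDie`/`StartKubeProxyOrDie` as a reversible fault.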
@@ -49,16 +43,7 @@ func main() {
     logs.InitLogs()
     defer logs.FlushLogs()

-    mgr := manager.NewManager(kubeProxyImage)
-    hostname, err := util.GetHostname(hostnameOverride)
-    if err != nil {
-        glog.Fatalf("get hostname failed, err: %v", err)
-    }
-    err = mgr.UpdateKubeProxyDaemonset(hostname)
-    if err != nil {
-        glog.Fatalf("update kube-proxy daemonset failed, err: %v", err)
-    }
-
+    mgr := manager.NewManager()
     server := api.NewServer(mgr, port)

     go wait.Forever(func() {
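The hunk ends just as `main` launches `go wait.Forever(func() { ... })`. Given the blank `_ "net/http/pprof"` import, this is presumably the usual self-restarting pprof server; a self-contained sketch of that pattern (the loop helper here stands in for `wait.Forever`, which this file gets from its k8s utility imports):

```go
package main

import (
	"fmt"
	"log"
	"net/http"
	_ "net/http/pprof" // registers /debug/pprof/* on the default mux
	"time"
)

var pprofPort = 6060

// serveForever mimics wait.Forever: run fn, and if it ever returns,
// restart it after a pause instead of crashing the whole process.
func serveForever(fn func(), period time.Duration) {
	for {
		fn()
		time.Sleep(period)
	}
}

func main() {
	serveForever(func() {
		addr := fmt.Sprintf(":%d", pprofPort)
		log.Printf("pprof listening on %s", addr)
		if err := http.ListenAndServe(addr, nil); err != nil {
			log.Printf("pprof server exited: %v", err)
		}
	}, 10*time.Second)
}
```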
8 changes: 6 additions & 2 deletions tests/cmd/stability/main.go
@@ -23,7 +23,7 @@ import (

     "github.com/pingcap/tidb-operator/tests/pkg/apimachinery"

-    "k8s.io/api/core/v1"
+    v1 "k8s.io/api/core/v1"

     "github.com/golang/glog"
     "github.com/jinzhu/copier"
@@ -332,12 +332,16 @@ func run() {
     // stop one etcd node and k8s/operator/tidbcluster is available
     faultEtcd := tests.SelectNode(cfg.ETCDs)
     fta.StopETCDOrDie(faultEtcd)
-    defer fta.StartETCDOrDie(faultEtcd)
     // TODO: make the pause interval an argument
     time.Sleep(3 * time.Minute)
     oa.CheckOneEtcdDownOrDie(operatorCfg, allClusters, faultEtcd)
+    fta.StartETCDOrDie(faultEtcd)
+
+    // stop all kube-proxy and k8s/operator/tidbcluster is available
+    fta.StopKubeProxyOrDie()
+    oa.CheckKubeProxyDownOrDie(allClusters)
+    fta.StartKubeProxyOrDie()

     // clean temp dirs when the stability test succeeds
     err := cfg.CleanTempDirs()
     if err != nil {
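Note why the `defer fta.StartETCDOrDie(faultEtcd)` becomes an explicit call: with `defer`, etcd would stay down until `run()` returned, so the new kube-proxy fault would be injected into a cluster whose etcd was still broken. Restoring each fault before injecting the next keeps the failure scenarios independent.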
50 changes: 40 additions & 10 deletions tests/failover.go
@@ -452,42 +452,72 @@ func (oa *operatorActions) CheckOneEtcdDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) {
         if err != nil {
             return err
         }
-        glog.V(4).Infof("all clusters is available")
+        glog.V(4).Infof("all clusters are available")
         return nil
     })
 }

+func (oa *operatorActions) CheckKubeProxyDownOrDie(clusters []*TidbClusterConfig) {
+    glog.Infof("checking k8s/tidbCluster status when kube-proxy down")
+
+    KeepOrDie(3*time.Second, 10*time.Minute, func() error {
+        err := oa.CheckK8sAvailable(nil, nil)
+        if err != nil {
+            return err
+        }
+        glog.V(4).Infof("k8s cluster is available.")
+        err = oa.CheckTidbClustersAvailable(clusters)
+        if err != nil {
+            return err
+        }
+        glog.V(4).Infof("all clusters are available.")
+        return nil
+    })
+}
+
 func (oa *operatorActions) CheckOneApiserverDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) {
     glog.Infof("check k8s/operator/tidbCluster status when one apiserver down")
     affectedPods := map[string]*corev1.Pod{}
-    apiserverPod, err := GetApiserverPod(oa.kubeCli, faultNode)
+    apiserverPod, err := GetKubeApiserverPod(oa.kubeCli, faultNode)
     if err != nil {
-        slack.NotifyAndPanic(fmt.Errorf("can't find apiserver in node:%s", faultNode))
+        slack.NotifyAndPanic(fmt.Errorf("can't find apiserver in k8s cluster"))
     }
     if apiserverPod != nil {
         affectedPods[apiserverPod.GetName()] = apiserverPod
     }
-    controllerPod, err := GetControllerManagerPod(oa.kubeCli, faultNode)
+
+    controllerPod, err := GetKubeControllerManagerPod(oa.kubeCli, faultNode)
     if err != nil {
-        glog.Infof("can't find controllerManager in node:%s", faultNode)
+        slack.NotifyAndPanic(fmt.Errorf("can't find kube-controller-manager in k8s cluster"))
     }
     if controllerPod != nil {
         affectedPods[controllerPod.GetName()] = controllerPod
     }
-    schedulerPod, err := GetSchedulerPod(oa.kubeCli, faultNode)
+
+    schedulerPod, err := GetKubeSchedulerPod(oa.kubeCli, faultNode)
     if err != nil {
-        glog.Infof("can't find scheduler in node:%s", faultNode)
+        slack.NotifyAndPanic(fmt.Errorf("can't find kube-scheduler in k8s cluster"))
     }
     if schedulerPod != nil {
         affectedPods[schedulerPod.GetName()] = schedulerPod
     }
-    dnsPod, err := GetDNSPod(oa.kubeCli, faultNode)
+
+    dnsPod, err := GetKubeDNSPod(oa.kubeCli, faultNode)
     if err != nil {
-        slack.NotifyAndPanic(fmt.Errorf("can't find controller-manager in node:%s", faultNode))
+        slack.NotifyAndPanic(fmt.Errorf("can't find kube-dns in k8s cluster"))
     }
     if dnsPod != nil {
         affectedPods[dnsPod.GetName()] = dnsPod
     }
+
+    proxyPod, err := GetKubeProxyPod(oa.kubeCli, faultNode)
+    if err != nil {
+        slack.NotifyAndPanic(fmt.Errorf("can't find kube-proxy in k8s cluster"))
+    }
+    if proxyPod != nil {
+        affectedPods[proxyPod.GetName()] = proxyPod
+    }
     KeepOrDie(3*time.Second, 10*time.Minute, func() error {
         err := oa.CheckK8sAvailable(map[string]string{faultNode: faultNode}, affectedPods)
         if err != nil {
@@ -555,7 +585,7 @@ func (oa *operatorActions) CheckK8sAvailable(excludeNodes map[string]string, exc
         }
         podState := GetPodStatus(&pod)
         if podState != string(corev1.PodRunning) {
-            return false, fmt.Errorf("pod:[%s/%s] is unavailable,state is %s", pod.GetName(), pod.GetNamespace(), podState)
+            return false, fmt.Errorf("pod:[%s/%s] is unavailable, state is %s", pod.GetNamespace(), pod.GetName(), podState)
         }
     }
     return true, nil
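Both new checks poll through `KeepOrDie(3*time.Second, 10*time.Minute, ...)`. A runnable sketch of the contract its name and call sites imply (assumed, not the repo's actual helper, which notifies Slack before panicking): the availability check must keep passing for the entire fault window, not merely pass once.

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

// KeepOrDie runs fn every interval for the whole duration and panics on
// the first failure — i.e. the checked condition must *hold* throughout.
func KeepOrDie(interval, duration time.Duration, fn func() error) {
	deadline := time.Now().Add(duration)
	for time.Now().Before(deadline) {
		if err := fn(); err != nil {
			panic(err) // the real helper notifies Slack before panicking
		}
		time.Sleep(interval)
	}
}

func main() {
	calls := 0
	KeepOrDie(10*time.Millisecond, 50*time.Millisecond, func() error {
		calls++
		if calls > 100 {
			return errors.New("cluster became unavailable")
		}
		return nil
	})
	fmt.Println("condition held for the full window after", calls, "checks")
}
```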