diff --git a/cmd/node_problem_detector.go b/cmd/node_problem_detector.go index c64b1113a..e62108f97 100644 --- a/cmd/node_problem_detector.go +++ b/cmd/node_problem_detector.go @@ -26,6 +26,7 @@ import ( "github.com/golang/glog" "github.com/spf13/pflag" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/node-problem-detector/cmd/options" "k8s.io/node-problem-detector/pkg/custompluginmonitor" "k8s.io/node-problem-detector/pkg/problemclient" @@ -95,7 +96,25 @@ func main() { startHTTPServer(p, npdo) } + // This function may be blocked (until a timeout occurs) before + // kube-apiserver becomes ready. + glog.Infof("Waiting for kube-apiserver to be ready (timeout %v)...", npdo.APIServerWaitTimeout) + if err := waitForAPIServerReadyWithTimeout(c, npdo); err != nil { + glog.Warningf("kube-apiserver did not become ready: timed out on waiting for kube-apiserver to return the node object: %v", err) + } + if err := p.Run(); err != nil { glog.Fatalf("Problem detector failed with error: %v", err) } } + +func waitForAPIServerReadyWithTimeout(c problemclient.Client, npdo *options.NodeProblemDetectorOptions) error { + return wait.PollImmediate(npdo.APIServerWaitInterval, npdo.APIServerWaitTimeout, func() (done bool, err error) { + // If NPD can get the node object from kube-apiserver, the server is + // ready and the RBAC permission is set correctly. + if _, err := c.GetNode(); err == nil { + return true, nil + } + return false, nil + }) +} diff --git a/cmd/options/options.go b/cmd/options/options.go index 4c10ec8cd..afd49bc7d 100644 --- a/cmd/options/options.go +++ b/cmd/options/options.go @@ -20,6 +20,7 @@ import ( "flag" "fmt" "os" + "time" "net/url" @@ -46,6 +47,12 @@ type NodeProblemDetectorOptions struct { ServerPort int // ServerAddress is the address to bind the node problem detector server. ServerAddress string + // APIServerWaitTimeout is the timeout on waiting for kube-apiserver to be + // ready. + APIServerWaitTimeout time.Duration + // APIServerWaitInterval is the interval between the checks on the + // readiness of kube-apiserver. + APIServerWaitInterval time.Duration // application options @@ -65,6 +72,8 @@ func (npdo *NodeProblemDetectorOptions) AddFlags(fs *pflag.FlagSet) { []string{}, "List of paths to custom plugin monitor config files, comma separated.") fs.StringVar(&npdo.ApiServerOverride, "apiserver-override", "", "Custom URI used to connect to Kubernetes ApiServer") + fs.DurationVar(&npdo.APIServerWaitTimeout, "apiserver-wait-timeout", time.Duration(5)*time.Minute, "The timeout on waiting for kube-apiserver to be ready. This is ignored if --enable-k8s-exporter is false.") + fs.DurationVar(&npdo.APIServerWaitInterval, "apiserver-wait-interval", time.Duration(5)*time.Second, "The interval between the checks on the readiness of kube-apiserver. This is ignored if --enable-k8s-exporter is false.") fs.BoolVar(&npdo.PrintVersion, "version", false, "Print version information and quit") fs.StringVar(&npdo.HostnameOverride, "hostname-override", "", "Custom node name used to override hostname") diff --git a/pkg/problemclient/fake_problem_client.go b/pkg/problemclient/fake_problem_client.go index 6f878493b..50e47350e 100644 --- a/pkg/problemclient/fake_problem_client.go +++ b/pkg/problemclient/fake_problem_client.go @@ -21,7 +21,7 @@ import ( "reflect" "sync" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" ) // FakeProblemClient is a fake problem client for debug. @@ -92,3 +92,7 @@ func (f *FakeProblemClient) GetConditions(types []v1.NodeConditionType) ([]*v1.N // Eventf does nothing now. func (f *FakeProblemClient) Eventf(eventType string, source, reason, messageFmt string, args ...interface{}) { } + +func (f *FakeProblemClient) GetNode() (*v1.Node, error) { + return nil, fmt.Errorf("GetNode() not implemented") +} diff --git a/pkg/problemclient/problem_client.go b/pkg/problemclient/problem_client.go index c2f841416..70a022ad6 100644 --- a/pkg/problemclient/problem_client.go +++ b/pkg/problemclient/problem_client.go @@ -26,7 +26,7 @@ import ( typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" "k8s.io/kubernetes/pkg/api/legacyscheme" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/clock" @@ -47,6 +47,9 @@ type Client interface { SetConditions(conditions []v1.NodeCondition) error // Eventf reports the event. Eventf(eventType string, source, reason, messageFmt string, args ...interface{}) + // GetNode returns the Node object of the node on which the + // node-problem-detector runs. + GetNode() (*v1.Node, error) } type nodeProblemClient struct { @@ -79,7 +82,7 @@ func NewClientOrDie(npdo *options.NodeProblemDetectorOptions) Client { } func (c *nodeProblemClient) GetConditions(conditionTypes []v1.NodeConditionType) ([]*v1.NodeCondition, error) { - node, err := c.client.Nodes().Get(c.nodeName, metav1.GetOptions{}) + node, err := c.GetNode() if err != nil { return nil, err } @@ -116,6 +119,10 @@ func (c *nodeProblemClient) Eventf(eventType, source, reason, messageFmt string, recorder.Eventf(c.nodeRef, eventType, reason, messageFmt, args...) } +func (c *nodeProblemClient) GetNode() (*v1.Node, error) { + return c.client.Nodes().Get(c.nodeName, metav1.GetOptions{}) +} + // generatePatch generates condition patch func generatePatch(conditions []v1.NodeCondition) ([]byte, error) { raw, err := json.Marshal(&conditions)