elastic · ruflin · Feb 13, 2018 · Feb 9, 2018 · Feb 13, 2018
diff --git a/CHANGELOG.asciidoc b/CHANGELOG.asciidoc
@@ -30,6 +30,8 @@ https://github.com/elastic/beats/compare/v6.2.0...6.2[Check the HEAD diff]
 
 *Affecting all Beats*
 
+- Fix infinite loop when event unmarshal fails in Kubernetes pod watcher. {pull}6353[6353]
+
 *Auditbeat*
 
 *Filebeat*

diff --git a/libbeat/common/kubernetes/watcher.go b/libbeat/common/kubernetes/watcher.go
@@ -3,6 +3,7 @@ package kubernetes
 import (
 	"context"
 	"errors"
+	"io"
 	"sync"
 	"time"
 
@@ -13,6 +14,9 @@ import (
 	corev1 "github.com/ericchiang/k8s/api/v1"
 )
 
+// maxBackoff is the upper bound on the wait between retries
+const maxBackoff = 30 * time.Second
+
 // Watcher reads Kubernetes events and keeps a list of known pods
 type Watcher interface {
 	// Start watching Kubernetes API for new containers
@@ -129,27 +133,41 @@ func (p *podWatcher) Start() error {
 }
 
 func (p *podWatcher) watch() {
+	// Failures counter, do exponential backoff on retries
+	var failures uint
+
 	for {
 		logp.Info("kubernetes: %s", "Watching API for pod events")
 		watcher, err := p.client.WatchPods(p.ctx, "", p.nodeFilter, k8s.ResourceVersion(p.lastResourceVersion))
 		if err != nil {
 			//watch pod failures should be logged and gracefully failed over as metadata retrieval
 			//should never stop.
 			logp.Err("kubernetes: Watching API error %v", err)
-			time.Sleep(time.Second)
+			backoff(failures)
+			failures++
 			continue
 		}
 
 		for {
 			_, apiPod, err := watcher.Next()
 			if err != nil {
 				logp.Err("kubernetes: Watching API error %v", err)
-				watcher.Close()
-				break
+
+				// In case of EOF, stop watching and restart the process
+				if err == io.EOF || err == io.ErrUnexpectedEOF {
+					watcher.Close()
+					backoff(failures)
+					failures++
+					break
+				}
+
+				// Otherwise, this is probably an unknown event (unmarshal error), ignore it
+				continue
 			}
 
-			// Update last resource version
+			// Update last resource version and reset failure counter
 			p.lastResourceVersion = apiPod.Metadata.GetResourceVersion()
+			failures = 0
 
 			pod := GetPod(apiPod)
 			if pod.Metadata.DeletionTimestamp != "" {
@@ -190,6 +208,14 @@ func (p *podWatcher) watch() {
 	}
 }
 
+// backoff sleeps for an exponentially growing interval: one second when
+// failures is 0, doubling with each additional failure, and never longer
+// than maxBackoff.
+func backoff(failures uint) {
+	wait := maxBackoff
+	// Only shift while the result is guaranteed to fit in a time.Duration
+	// (int64 nanoseconds). With a larger exponent the value overflows and
+	// wraps to a negative or zero duration, which would slip past the cap
+	// and make time.Sleep return immediately, turning the caller's retry
+	// loop into a busy loop.
+	if failures < 32 {
+		if d := time.Second << failures; d < maxBackoff {
+			wait = d
+		}
+	}
+	time.Sleep(wait)
+}
+
 // Check annotations flagged as deleted for their last access time, fully delete
 // the ones older than p.cleanupTimeout
 func (p *podWatcher) cleanupWorker() {