diff --git a/clients/pkg/promtail/targets/gcplog/metrics.go b/clients/pkg/promtail/targets/gcplog/metrics.go
index b149aa9fad9f1..9eb51a7855a12 100644
--- a/clients/pkg/promtail/targets/gcplog/metrics.go
+++ b/clients/pkg/promtail/targets/gcplog/metrics.go
@@ -7,8 +7,9 @@ type Metrics struct {
 	// reg is the Registerer used to create this set of metrics.
 	reg prometheus.Registerer
 
-	gcplogEntries *prometheus.CounterVec
-	gcplogErrors  *prometheus.CounterVec
+	gcplogEntries                 *prometheus.CounterVec
+	gcplogErrors                  *prometheus.CounterVec
+	gcplogTargetLastSuccessScrape *prometheus.GaugeVec
 }
 
 // NewMetrics creates a new set of metrics. Metrics will be registered to reg.
@@ -28,6 +29,12 @@ func NewMetrics(reg prometheus.Registerer) *Metrics {
 		Help: "Total number of parsing errors while receiving gcplog messages",
 	}, []string{"project"})
 
-	reg.MustRegister(m.gcplogEntries, m.gcplogErrors)
+	m.gcplogTargetLastSuccessScrape = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: "promtail",
+		Name:      "gcplog_target_last_success_scrape",
+		Help:      "Unix timestamp of the target's last successful poll",
+	}, []string{"project", "target"})
+
+	reg.MustRegister(m.gcplogEntries, m.gcplogErrors, m.gcplogTargetLastSuccessScrape)
 	return &m
 }
diff --git a/clients/pkg/promtail/targets/gcplog/target.go b/clients/pkg/promtail/targets/gcplog/target.go
index 30d1e8314a8df..7dd2ff8a2a811 100644
--- a/clients/pkg/promtail/targets/gcplog/target.go
+++ b/clients/pkg/promtail/targets/gcplog/target.go
@@ -108,9 +108,10 @@ func (t *GcplogTarget) run() error {
+			// Record the time of each successful pull so stalled scraping shows up as a stale timestamp.
+			t.metrics.gcplogTargetLastSuccessScrape.WithLabelValues(t.config.ProjectID, t.config.Subscription).SetToCurrentTime()
 			t.msgs <- m
 		})
 		if err != nil {
-			// TODO(kavi): Add proper error propagation maybe?
-			level.Error(t.logger).Log("error", err)
+			level.Error(t.logger).Log("msg", "failed to receive pubsub messages", "error", err)
 			t.metrics.gcplogErrors.WithLabelValues(t.config.ProjectID).Inc()
 		}
 	}()
 
@@ -138,7 +138,11 @@ func (t *GcplogTarget) Type() target.TargetType {
 }
 
 func (t *GcplogTarget) Ready() bool {
-	return t.ctx.Err() == nil
+	// Return true just like all other targets: gcplog scraping shouldn't stop
+	// because of transient timeout errors, and a transient failure here would fail
+	// promtail's readiness probe, which may prevent the pod from starting. The
+	// `gcplog_target_last_success_scrape` metric tracks whether scraping has stalled.
+	return true
 }
 
 func (t *GcplogTarget) DiscoveredLabels() model.LabelSet {
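
Below is a minimal sketch, not part of the patch itself, of how the new gauge could be exercised in a package-local unit test. It assumes only the `NewMetrics` constructor shown above; the test name and the label values ("my-project", "my-subscription") are hypothetical.

```go
package gcplog

import (
	"testing"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

// TestLastSuccessScrapeGauge simulates what run() does after a successful
// pubsub receive and checks that the gauge holds a recent Unix timestamp.
func TestLastSuccessScrapeGauge(t *testing.T) {
	// Use a fresh registry so the test doesn't collide with the default one.
	reg := prometheus.NewRegistry()
	m := NewMetrics(reg)

	// Hypothetical label values; in the target these come from
	// t.config.ProjectID and t.config.Subscription.
	gauge := m.gcplogTargetLastSuccessScrape.WithLabelValues("my-project", "my-subscription")
	gauge.SetToCurrentTime()

	got := testutil.ToFloat64(gauge)
	if age := time.Since(time.Unix(int64(got), 0)); age > time.Minute {
		t.Fatalf("last success scrape timestamp is stale: %v", age)
	}
}
```

Since the gauge stores a Unix timestamp, an operator can alert on stalled scraping by checking how far `promtail_gcplog_target_last_success_scrape` lags behind the current time, rather than relying on the readiness probe that this change deliberately relaxes.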