From ba17805ff50cf61c56f8aac9b996b81420baa971 Mon Sep 17 00:00:00 2001 From: Dan Kanefsky <56059752+boojamya@users.noreply.github.com> Date: Mon, 18 Dec 2023 10:51:59 -0800 Subject: [PATCH] Add Prometheus metrics for `unrelayed-packets` and `unrelayed-acknoledgments` (#1356) * unrelayed packets metric * feedback --- docs/advanced_usage.md | 15 +++++++++------ relayer/processor/metrics.go | 19 +++++++++++++++++++ relayer/processor/path_processor_internal.go | 12 ++++++++++++ 3 files changed, 40 insertions(+), 6 deletions(-) diff --git a/docs/advanced_usage.md b/docs/advanced_usage.md index 8b17fafdd..996a694c1 100644 --- a/docs/advanced_usage.md +++ b/docs/advanced_usage.md @@ -12,15 +12,18 @@ Exported metrics: | **Exported Metric** | **Description** | **Type** | |:---------------------------------------------: |:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |:--------: | -| cosmos_relayer_observed_packets_total | The total number of observed packets | Counter | -| cosmos_relayer_relayed_packets_total | The total number of relayed packets | Counter | +| cosmos_relayer_observed_packets_total | The total number of observed packets | Counter | +| cosmos_relayer_relayed_packets_total | The total number of relayed packets | Counter | | cosmos_relayer_chain_latest_height | The current height of the chain | Gauge | | cosmos_relayer_wallet_balance | The current balance for the relayer's wallet | Gauge | | cosmos_relayer_fees_spent | The amount of fees spent from the relayer's wallet | Gauge | -| cosmos_relayer_tx_failure |
The total number of tx failures broken up into categories:
- "packet messages are redundant"
- "insufficient funds"
- "invalid coins"
- "out of gas"


"Tx Failure" is the the catch all bucket | Counter | -| cosmos_relayer_block_query_errors_total | The total number of block query failures. The failures are separated into two categories:
- "RPC Client"
- "IBC Header" | Counter | -| cosmos_relayer_client_expiration_seconds | Seconds until the client expires | Gauge | -| cosmos_relayer_client_trusting_period_seconds | The trusting period (in seconds) of the client | Gauge | +| cosmos_relayer_tx_failure |
The total number of tx failures broken up into categories:
- "packet messages are redundant"
- "insufficient funds"
- "invalid coins"
- "out of gas"


"Tx Failure" is the catch-all bucket | Counter | +| cosmos_relayer_block_query_errors_total | The total number of block query failures. The failures are separated into two categories:
- "RPC Client"
- "IBC Header" | Counter | +| cosmos_relayer_client_expiration_seconds | Seconds until the client expires | Gauge | +| cosmos_relayer_client_trusting_period_seconds | The trusting period (in seconds) of the client | Gauge | +| cosmos_relayer_unrelayed_packets | Current number of unrelayed packet sequences on a specific path and channel. This is updated after each flush (default is 5 min) | Gauge | +| cosmos_relayer_unrelayed_acks | Current number of unrelayed acknowledgment sequences on a specific path and channel. This is updated after each flush (default is 5 min) | Gauge | + diff --git a/relayer/processor/metrics.go b/relayer/processor/metrics.go index 61f1a9fbe..2630fcd2b 100644 --- a/relayer/processor/metrics.go +++ b/relayer/processor/metrics.go @@ -18,6 +18,8 @@ type PrometheusMetrics struct { BlockQueryFailure *prometheus.CounterVec ClientExpiration *prometheus.GaugeVec ClientTrustingPeriod *prometheus.GaugeVec + UnrelayedPackets *prometheus.GaugeVec + UnrelayedAcks *prometheus.GaugeVec } func (m *PrometheusMetrics) AddPacketsObserved(pathName, chain, channel, port, eventType string, count int) { @@ -56,6 +58,14 @@ func (m *PrometheusMetrics) IncTxFailure(pathName, chain, errDesc string) { m.TxFailureError.WithLabelValues(pathName, chain, errDesc).Inc() } +func (m *PrometheusMetrics) SetUnrelayedPackets(pathName, srcChain, destChain, srcChannel, destChannel string, unrelayedPackets int) { + m.UnrelayedPackets.WithLabelValues(pathName, srcChain, destChain, srcChannel, destChannel).Set(float64(unrelayedPackets)) +} + +func (m *PrometheusMetrics) SetUnrelayedAcks(pathName, srcChain, destChain, srcChannel, destChannel string, UnrelayedAcks int) { + m.UnrelayedAcks.WithLabelValues(pathName, srcChain, destChain, srcChannel, destChannel).Set(float64(UnrelayedAcks)) +} + func NewPrometheusMetrics() *PrometheusMetrics { packetLabels := []string{"path_name", "chain", "channel", "port", "type"} heightLabels := []string{"chain"} @@ -64,6 +74,7 @@ func 
NewPrometheusMetrics() *PrometheusMetrics { walletLabels := []string{"chain", "gas_price", "key", "address", "denom"} clientExpirationLables := []string{"path_name", "chain", "client_id", "trusting_period"} clientTrustingPeriodLables := []string{"path_name", "chain", "client_id"} + unrelayedSeqsLabels := []string{"path_name", "src_chain", "dest_chain", "src_channel", "dest_channel"} registry := prometheus.NewRegistry() registerer := promauto.With(registry) return &PrometheusMetrics{ @@ -104,5 +115,13 @@ func NewPrometheusMetrics() *PrometheusMetrics { Name: "cosmos_relayer_client_trusting_period_seconds", Help: "The trusting period (in seconds) of the client", }, clientTrustingPeriodLables), + UnrelayedPackets: registerer.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cosmos_relayer_unrelayed_packets", + Help: "Current number of unrelayed packets on both the source and destination chains for a specific path and channel", + }, unrelayedSeqsLabels), + UnrelayedAcks: registerer.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cosmos_relayer_unrelayed_acks", + Help: "Current number of unrelayed acknowledgements on both the source and destination chains for a specific path and channel", + }, unrelayedSeqsLabels), } } diff --git a/relayer/processor/path_processor_internal.go b/relayer/processor/path_processor_internal.go index 1af9eceae..1f2c627bb 100644 --- a/relayer/processor/path_processor_internal.go +++ b/relayer/processor/path_processor_internal.go @@ -1204,6 +1204,10 @@ func (pp *PathProcessor) queuePendingRecvAndAcks( if len(seqs) == 0 { src.log.Debug("Nothing to flush", zap.String("channel", k.ChannelID), zap.String("port", k.PortID)) + if pp.metrics != nil { + pp.metrics.SetUnrelayedPackets(pp.pathEnd1.info.PathName, src.info.ChainID, dst.info.ChainID, k.ChannelID, k.CounterpartyChannelID, 0) + pp.metrics.SetUnrelayedAcks(pp.pathEnd1.info.PathName, src.info.ChainID, dst.info.ChainID, k.ChannelID, k.CounterpartyChannelID, 0) + } return nil, nil } @@ -1214,6 +1218,10 @@ 
func (pp *PathProcessor) queuePendingRecvAndAcks( return nil, err } + if pp.metrics != nil { + pp.metrics.SetUnrelayedPackets(pp.pathEnd1.info.PathName, src.info.ChainID, dst.info.ChainID, k.ChannelID, k.CounterpartyChannelID, len(unrecv)) + } + dstHeight := int64(dst.latestBlock.Height) var order chantypes.Order @@ -1327,6 +1335,10 @@ SeqLoop: unacked = append(unacked, seq) } + if pp.metrics != nil { + pp.metrics.SetUnrelayedAcks(pp.pathEnd1.info.PathName, src.info.ChainID, dst.info.ChainID, k.ChannelID, k.CounterpartyChannelID, len(unacked)) + } + for i, seq := range unacked { dstMu.Lock() ck := k.Counterparty()