From ba17805ff50cf61c56f8aac9b996b81420baa971 Mon Sep 17 00:00:00 2001
From: Dan Kanefsky <56059752+boojamya@users.noreply.github.com>
Date: Mon, 18 Dec 2023 10:51:59 -0800
Subject: [PATCH] Add Prometheus metrics for `unrelayed-packets` and
`unrelayed-acknowledgments` (#1356)
* unrelayed packets metric
* feedback
---
docs/advanced_usage.md | 15 +++++++++------
relayer/processor/metrics.go | 19 +++++++++++++++++++
relayer/processor/path_processor_internal.go | 12 ++++++++++++
3 files changed, 40 insertions(+), 6 deletions(-)
diff --git a/docs/advanced_usage.md b/docs/advanced_usage.md
index 8b17fafdd..996a694c1 100644
--- a/docs/advanced_usage.md
+++ b/docs/advanced_usage.md
@@ -12,15 +12,18 @@ Exported metrics:
| **Exported Metric** | **Description** | **Type** |
|:---------------------------------------------: |:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |:--------: |
-| cosmos_relayer_observed_packets_total | The total number of observed packets | Counter |
-| cosmos_relayer_relayed_packets_total | The total number of relayed packets | Counter |
+| cosmos_relayer_observed_packets_total | The total number of observed packets | Counter |
+| cosmos_relayer_relayed_packets_total | The total number of relayed packets | Counter |
| cosmos_relayer_chain_latest_height | The current height of the chain | Gauge |
| cosmos_relayer_wallet_balance | The current balance for the relayer's wallet | Gauge |
| cosmos_relayer_fees_spent | The amount of fees spent from the relayer's wallet | Gauge |
-| cosmos_relayer_tx_failure |
The total number of tx failures broken up into categories:
- "packet messages are redundant"
- "insufficient funds"
- "invalid coins"
- "out of gas"
"Tx Failure" is the the catch all bucket | Counter |
-| cosmos_relayer_block_query_errors_total | The total number of block query failures. The failures are separated into two categories:
- "RPC Client"
- "IBC Header" | Counter |
-| cosmos_relayer_client_expiration_seconds | Seconds until the client expires | Gauge |
-| cosmos_relayer_client_trusting_period_seconds | The trusting period (in seconds) of the client | Gauge |
+| cosmos_relayer_tx_failure |
The total number of tx failures broken up into categories:
- "packet messages are redundant"
- "insufficient funds"
- "invalid coins"
- "out of gas"
"Tx Failure" is the catch-all bucket | Counter |
+| cosmos_relayer_block_query_errors_total | The total number of block query failures. The failures are separated into two categories:
- "RPC Client"
- "IBC Header" | Counter |
+| cosmos_relayer_client_expiration_seconds | Seconds until the client expires | Gauge |
+| cosmos_relayer_client_trusting_period_seconds | The trusting period (in seconds) of the client | Gauge |
+| cosmos_relayer_unrelayed_packets | Current number of unrelayed packet sequences on a specific path and channel. This is updated after each flush (default is 5 min) | Gauge |
+| cosmos_relayer_unrelayed_acks | Current number of unrelayed acknowledgment sequences on a specific path and channel. This is updated after each flush (default is 5 min) | Gauge |
+
diff --git a/relayer/processor/metrics.go b/relayer/processor/metrics.go
index 61f1a9fbe..2630fcd2b 100644
--- a/relayer/processor/metrics.go
+++ b/relayer/processor/metrics.go
@@ -18,6 +18,8 @@ type PrometheusMetrics struct {
BlockQueryFailure *prometheus.CounterVec
ClientExpiration *prometheus.GaugeVec
ClientTrustingPeriod *prometheus.GaugeVec
+ UnrelayedPackets *prometheus.GaugeVec
+ UnrelayedAcks *prometheus.GaugeVec
}
func (m *PrometheusMetrics) AddPacketsObserved(pathName, chain, channel, port, eventType string, count int) {
@@ -56,6 +58,14 @@ func (m *PrometheusMetrics) IncTxFailure(pathName, chain, errDesc string) {
m.TxFailureError.WithLabelValues(pathName, chain, errDesc).Inc()
}
+func (m *PrometheusMetrics) SetUnrelayedPackets(pathName, srcChain, destChain, srcChannel, destChannel string, unrelayedPackets int) {
+ m.UnrelayedPackets.WithLabelValues(pathName, srcChain, destChain, srcChannel, destChannel).Set(float64(unrelayedPackets))
+}
+
+func (m *PrometheusMetrics) SetUnrelayedAcks(pathName, srcChain, destChain, srcChannel, destChannel string, unrelayedAcks int) {
+	m.UnrelayedAcks.WithLabelValues(pathName, srcChain, destChain, srcChannel, destChannel).Set(float64(unrelayedAcks))
+}
+
func NewPrometheusMetrics() *PrometheusMetrics {
packetLabels := []string{"path_name", "chain", "channel", "port", "type"}
heightLabels := []string{"chain"}
@@ -64,6 +74,7 @@ func NewPrometheusMetrics() *PrometheusMetrics {
walletLabels := []string{"chain", "gas_price", "key", "address", "denom"}
clientExpirationLables := []string{"path_name", "chain", "client_id", "trusting_period"}
clientTrustingPeriodLables := []string{"path_name", "chain", "client_id"}
+ unrelayedSeqsLabels := []string{"path_name", "src_chain", "dest_chain", "src_channel", "dest_channel"}
registry := prometheus.NewRegistry()
registerer := promauto.With(registry)
return &PrometheusMetrics{
@@ -104,5 +115,13 @@ func NewPrometheusMetrics() *PrometheusMetrics {
Name: "cosmos_relayer_client_trusting_period_seconds",
Help: "The trusting period (in seconds) of the client",
}, clientTrustingPeriodLables),
+ UnrelayedPackets: registerer.NewGaugeVec(prometheus.GaugeOpts{
+ Name: "cosmos_relayer_unrelayed_packets",
+ Help: "Current number of unrelayed packets on both the source and destination chains for a specific path and channel",
+ }, unrelayedSeqsLabels),
+ UnrelayedAcks: registerer.NewGaugeVec(prometheus.GaugeOpts{
+ Name: "cosmos_relayer_unrelayed_acks",
+ Help: "Current number of unrelayed acknowledgements on both the source and destination chains for a specific path and channel",
+ }, unrelayedSeqsLabels),
}
}
diff --git a/relayer/processor/path_processor_internal.go b/relayer/processor/path_processor_internal.go
index 1af9eceae..1f2c627bb 100644
--- a/relayer/processor/path_processor_internal.go
+++ b/relayer/processor/path_processor_internal.go
@@ -1204,6 +1204,10 @@ func (pp *PathProcessor) queuePendingRecvAndAcks(
if len(seqs) == 0 {
src.log.Debug("Nothing to flush", zap.String("channel", k.ChannelID), zap.String("port", k.PortID))
+ if pp.metrics != nil {
+ pp.metrics.SetUnrelayedPackets(pp.pathEnd1.info.PathName, src.info.ChainID, dst.info.ChainID, k.ChannelID, k.CounterpartyChannelID, 0)
+ pp.metrics.SetUnrelayedAcks(pp.pathEnd1.info.PathName, src.info.ChainID, dst.info.ChainID, k.ChannelID, k.CounterpartyChannelID, 0)
+ }
return nil, nil
}
@@ -1214,6 +1218,10 @@ func (pp *PathProcessor) queuePendingRecvAndAcks(
return nil, err
}
+ if pp.metrics != nil {
+ pp.metrics.SetUnrelayedPackets(pp.pathEnd1.info.PathName, src.info.ChainID, dst.info.ChainID, k.ChannelID, k.CounterpartyChannelID, len(unrecv))
+ }
+
dstHeight := int64(dst.latestBlock.Height)
var order chantypes.Order
@@ -1327,6 +1335,10 @@ SeqLoop:
unacked = append(unacked, seq)
}
+ if pp.metrics != nil {
+ pp.metrics.SetUnrelayedAcks(pp.pathEnd1.info.PathName, src.info.ChainID, dst.info.ChainID, k.ChannelID, k.CounterpartyChannelID, len(unacked))
+ }
+
for i, seq := range unacked {
dstMu.Lock()
ck := k.Counterparty()