Skip to content

Commit

Permalink
Collect InfiniBand port state and physical state (prometheus#1357)
Browse files (browse the repository at this point in the history)
Collect the InfiniBand port state, the physical state, and the maximum
signal transfer rate.

Signed-off-by: Benjamin Drung <[email protected]>
  • Loading branch information
bdrung authored and oblitorum committed Apr 9, 2024
1 parent f2756fc commit b99aec3
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 0 deletions.
15 changes: 15 additions & 0 deletions collector/fixtures/e2e-64k-page-output.txt
Original file line number | Diff line number | Diff line change
Expand Up @@ -840,6 +840,11 @@ node_infiniband_multicast_packets_received_total{device="mlx4_0",port="2"} 0
# TYPE node_infiniband_multicast_packets_transmitted_total counter
node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16
node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0
# HELP node_infiniband_physical_state_id Physical state of the InfiniBand port (0: no change, 1: sleep, 2: polling, 3: disable, 4: shift, 5: link up, 6: link error recover, 7: phytest)
# TYPE node_infiniband_physical_state_id gauge
node_infiniband_physical_state_id{device="i40iw0",port="1"} 5
node_infiniband_physical_state_id{device="mlx4_0",port="1"} 5
node_infiniband_physical_state_id{device="mlx4_0",port="2"} 5
# HELP node_infiniband_port_constraint_errors_received_total Number of packets received on the switch physical port that are discarded
# TYPE node_infiniband_port_constraint_errors_received_total counter
node_infiniband_port_constraint_errors_received_total{device="mlx4_0",port="1"} 0
Expand Down Expand Up @@ -872,6 +877,16 @@ node_infiniband_port_packets_transmitted_total{device="mlx4_0",port="1"} 6.23586
# HELP node_infiniband_port_transmit_wait_total Number of ticks during which the port had data to transmit but no data was sent during the entire tick
# TYPE node_infiniband_port_transmit_wait_total counter
node_infiniband_port_transmit_wait_total{device="mlx4_0",port="1"} 4.294967295e+09
# HELP node_infiniband_rate_bytes_per_second Maximum signal transfer rate
# TYPE node_infiniband_rate_bytes_per_second gauge
node_infiniband_rate_bytes_per_second{device="i40iw0",port="1"} 1.25e+09
node_infiniband_rate_bytes_per_second{device="mlx4_0",port="1"} 5e+09
node_infiniband_rate_bytes_per_second{device="mlx4_0",port="2"} 5e+09
# HELP node_infiniband_state_id State of the InfiniBand port (0: no change, 1: down, 2: init, 3: armed, 4: active, 5: act defer)
# TYPE node_infiniband_state_id gauge
node_infiniband_state_id{device="i40iw0",port="1"} 4
node_infiniband_state_id{device="mlx4_0",port="1"} 4
node_infiniband_state_id{device="mlx4_0",port="2"} 4
# HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors)
# TYPE node_infiniband_unicast_packets_received_total counter
node_infiniband_unicast_packets_received_total{device="mlx4_0",port="1"} 61148
Expand Down
15 changes: 15 additions & 0 deletions collector/fixtures/e2e-output.txt
Original file line number | Diff line number | Diff line change
Expand Up @@ -1112,6 +1112,11 @@ node_infiniband_multicast_packets_received_total{device="mlx4_0",port="2"} 0
# TYPE node_infiniband_multicast_packets_transmitted_total counter
node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16
node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0
# HELP node_infiniband_physical_state_id Physical state of the InfiniBand port (0: no change, 1: sleep, 2: polling, 3: disable, 4: shift, 5: link up, 6: link error recover, 7: phytest)
# TYPE node_infiniband_physical_state_id gauge
node_infiniband_physical_state_id{device="i40iw0",port="1"} 5
node_infiniband_physical_state_id{device="mlx4_0",port="1"} 5
node_infiniband_physical_state_id{device="mlx4_0",port="2"} 5
# HELP node_infiniband_port_constraint_errors_received_total Number of packets received on the switch physical port that are discarded
# TYPE node_infiniband_port_constraint_errors_received_total counter
node_infiniband_port_constraint_errors_received_total{device="mlx4_0",port="1"} 0
Expand Down Expand Up @@ -1144,6 +1149,16 @@ node_infiniband_port_packets_transmitted_total{device="mlx4_0",port="1"} 6.23586
# HELP node_infiniband_port_transmit_wait_total Number of ticks during which the port had data to transmit but no data was sent during the entire tick
# TYPE node_infiniband_port_transmit_wait_total counter
node_infiniband_port_transmit_wait_total{device="mlx4_0",port="1"} 4.294967295e+09
# HELP node_infiniband_rate_bytes_per_second Maximum signal transfer rate
# TYPE node_infiniband_rate_bytes_per_second gauge
node_infiniband_rate_bytes_per_second{device="i40iw0",port="1"} 1.25e+09
node_infiniband_rate_bytes_per_second{device="mlx4_0",port="1"} 5e+09
node_infiniband_rate_bytes_per_second{device="mlx4_0",port="2"} 5e+09
# HELP node_infiniband_state_id State of the InfiniBand port (0: no change, 1: down, 2: init, 3: armed, 4: active, 5: act defer)
# TYPE node_infiniband_state_id gauge
node_infiniband_state_id{device="i40iw0",port="1"} 4
node_infiniband_state_id{device="mlx4_0",port="1"} 4
node_infiniband_state_id{device="mlx4_0",port="2"} 4
# HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors)
# TYPE node_infiniband_unicast_packets_received_total counter
node_infiniband_unicast_packets_received_total{device="mlx4_0",port="1"} 61148
Expand Down
7 changes: 7 additions & 0 deletions collector/infiniband_linux.go
Original file line number | Diff line number | Diff line change
Expand Up @@ -57,6 +57,7 @@ func NewInfiniBandCollector() (Collector, error) {
"link_error_recovery_total": "Number of times the link successfully recovered from an error state",
"multicast_packets_received_total": "Number of multicast packets received (including errors)",
"multicast_packets_transmitted_total": "Number of multicast packets transmitted (including errors)",
"physical_state_id": "Physical state of the InfiniBand port (0: no change, 1: sleep, 2: polling, 3: disable, 4: shift, 5: link up, 6: link error recover, 7: phytest)",
"port_constraint_errors_received_total": "Number of packets received on the switch physical port that are discarded",
"port_constraint_errors_transmitted_total": "Number of packets not transmitted from the switch physical port",
"port_data_received_bytes_total": "Number of data octets received on all links",
Expand All @@ -67,6 +68,8 @@ func NewInfiniBandCollector() (Collector, error) {
"port_packets_received_total": "Number of packets received on all VLs by this port (including errors)",
"port_packets_transmitted_total": "Number of packets transmitted on all VLs from this port (including errors)",
"port_transmit_wait_total": "Number of ticks during which the port had data to transmit but no data was sent during the entire tick",
"rate_bytes_per_second": "Maximum signal transfer rate",
"state_id": "State of the InfiniBand port (0: no change, 1: down, 2: init, 3: armed, 4: active, 5: act defer)",
"unicast_packets_received_total": "Number of unicast packets received (including errors)",
"unicast_packets_transmitted_total": "Number of unicast packets transmitted (including errors)",
}
Expand Down Expand Up @@ -105,6 +108,10 @@ func (c *infinibandCollector) Update(ch chan<- prometheus.Metric) error {
for _, port := range device.Ports {
portStr := strconv.FormatUint(uint64(port.Port), 10)

c.pushMetric(ch, "state_id", uint64(port.StateID), port.Name, portStr, prometheus.GaugeValue)
c.pushMetric(ch, "physical_state_id", uint64(port.PhysStateID), port.Name, portStr, prometheus.GaugeValue)
c.pushMetric(ch, "rate_bytes_per_second", port.Rate, port.Name, portStr, prometheus.GaugeValue)

c.pushCounter(ch, "legacy_multicast_packets_received_total", port.Counters.LegacyPortMulticastRcvPackets, port.Name, portStr)
c.pushCounter(ch, "legacy_multicast_packets_transmitted_total", port.Counters.LegacyPortMulticastXmitPackets, port.Name, portStr)
c.pushCounter(ch, "legacy_data_received_bytes_total", port.Counters.LegacyPortRcvData64, port.Name, portStr)
Expand Down

0 comments on commit b99aec3

Please sign in to comment.