aboutsummaryrefslogtreecommitdiff
path: root/collector
diff options
context:
space:
mode:
author Benjamin Drung <benjamin.drung@cloud.ionos.com> 2019-09-23 18:18:35 +0200
committer Johannes 'fish' Ziemke <github@freigeist.org> 2019-09-23 18:18:35 +0200
commit 27b8c93a5afc21632239890c4558c7300cca17d2 (patch)
tree b435dabd830d058e50a4ed0df2255dea101fd5ee /collector
parent f3538e1fc6468ecaeb616fb74597330fb6190159 (diff)
downloadprometheus_node_collector-27b8c93a5afc21632239890c4558c7300cca17d2.tar.bz2
prometheus_node_collector-27b8c93a5afc21632239890c4558c7300cca17d2.tar.xz
prometheus_node_collector-27b8c93a5afc21632239890c4558c7300cca17d2.zip
Use InfiniBandClass from procfs library (#1396)
Parsing the sysfs files for InfiniBand was added to the procfs library (see https://github.com/prometheus/procfs/pull/164). Therefore use `InfiniBandClass` from the procfs library instead of parsing sysfs itself. If a port counter returns `N/A (no PMA)`, no metric will be returned (instead of returning 0 for this metric). Signed-off-by: Benjamin Drung <benjamin.drung@cloud.ionos.com>
Diffstat (limited to 'collector')
-rw-r--r--collector/fixtures/e2e-64k-page-output.txt11
-rw-r--r--collector/fixtures/e2e-output.txt11
-rw-r--r--collector/fixtures/sys.ttar75
-rw-r--r--collector/infiniband_linux.go262
-rw-r--r--collector/infiniband_linux_test.go40
5 files changed, 149 insertions, 250 deletions
diff --git a/collector/fixtures/e2e-64k-page-output.txt b/collector/fixtures/e2e-64k-page-output.txt
index 6b2e6bd..4ac50a1 100644
--- a/collector/fixtures/e2e-64k-page-output.txt
+++ b/collector/fixtures/e2e-64k-page-output.txt
@@ -816,12 +816,10 @@ node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="1
816node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 61239 816node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 61239
817# HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down 817# HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down
818# TYPE node_infiniband_link_downed_total counter 818# TYPE node_infiniband_link_downed_total counter
819node_infiniband_link_downed_total{device="i40iw0",port="1"} 0
820node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0 819node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0
821node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0 820node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0
822# HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state 821# HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state
823# TYPE node_infiniband_link_error_recovery_total counter 822# TYPE node_infiniband_link_error_recovery_total counter
824node_infiniband_link_error_recovery_total{device="i40iw0",port="1"} 0
825node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0 823node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0
826node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0 824node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0
827# HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors) 825# HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors)
@@ -834,20 +832,16 @@ node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16
834node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0 832node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0
835# HELP node_infiniband_port_constraint_errors_received_total Number of packets received on the switch physical port that are discarded 833# HELP node_infiniband_port_constraint_errors_received_total Number of packets received on the switch physical port that are discarded
836# TYPE node_infiniband_port_constraint_errors_received_total counter 834# TYPE node_infiniband_port_constraint_errors_received_total counter
837node_infiniband_port_constraint_errors_received_total{device="i40iw0",port="1"} 0
838node_infiniband_port_constraint_errors_received_total{device="mlx4_0",port="1"} 0 835node_infiniband_port_constraint_errors_received_total{device="mlx4_0",port="1"} 0
839# HELP node_infiniband_port_constraint_errors_transmitted_total Number of packets not transmitted from the switch physical port 836# HELP node_infiniband_port_constraint_errors_transmitted_total Number of packets not transmitted from the switch physical port
840# TYPE node_infiniband_port_constraint_errors_transmitted_total counter 837# TYPE node_infiniband_port_constraint_errors_transmitted_total counter
841node_infiniband_port_constraint_errors_transmitted_total{device="i40iw0",port="1"} 0
842node_infiniband_port_constraint_errors_transmitted_total{device="mlx4_0",port="1"} 0 838node_infiniband_port_constraint_errors_transmitted_total{device="mlx4_0",port="1"} 0
843# HELP node_infiniband_port_data_received_bytes_total Number of data octets received on all links 839# HELP node_infiniband_port_data_received_bytes_total Number of data octets received on all links
844# TYPE node_infiniband_port_data_received_bytes_total counter 840# TYPE node_infiniband_port_data_received_bytes_total counter
845node_infiniband_port_data_received_bytes_total{device="i40iw0",port="1"} 0
846node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="1"} 1.8527668e+07 841node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="1"} 1.8527668e+07
847node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="2"} 0 842node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="2"} 0
848# HELP node_infiniband_port_data_transmitted_bytes_total Number of data octets transmitted on all links 843# HELP node_infiniband_port_data_transmitted_bytes_total Number of data octets transmitted on all links
849# TYPE node_infiniband_port_data_transmitted_bytes_total counter 844# TYPE node_infiniband_port_data_transmitted_bytes_total counter
850node_infiniband_port_data_transmitted_bytes_total{device="i40iw0",port="1"} 0
851node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="1"} 1.493376e+07 845node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="1"} 1.493376e+07
852node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0 846node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0
853# HELP node_infiniband_port_discards_received_total Number of inbound packets discarded by the port because the port is down or congested 847# HELP node_infiniband_port_discards_received_total Number of inbound packets discarded by the port because the port is down or congested
@@ -855,23 +849,18 @@ node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0
855node_infiniband_port_discards_received_total{device="mlx4_0",port="1"} 0 849node_infiniband_port_discards_received_total{device="mlx4_0",port="1"} 0
856# HELP node_infiniband_port_discards_transmitted_total Number of outbound packets discarded by the port because the port is down or congested 850# HELP node_infiniband_port_discards_transmitted_total Number of outbound packets discarded by the port because the port is down or congested
857# TYPE node_infiniband_port_discards_transmitted_total counter 851# TYPE node_infiniband_port_discards_transmitted_total counter
858node_infiniband_port_discards_transmitted_total{device="i40iw0",port="1"} 0
859node_infiniband_port_discards_transmitted_total{device="mlx4_0",port="1"} 5 852node_infiniband_port_discards_transmitted_total{device="mlx4_0",port="1"} 5
860# HELP node_infiniband_port_errors_received_total Number of packets containing an error that were received on this port 853# HELP node_infiniband_port_errors_received_total Number of packets containing an error that were received on this port
861# TYPE node_infiniband_port_errors_received_total counter 854# TYPE node_infiniband_port_errors_received_total counter
862node_infiniband_port_errors_received_total{device="i40iw0",port="1"} 0
863node_infiniband_port_errors_received_total{device="mlx4_0",port="1"} 0 855node_infiniband_port_errors_received_total{device="mlx4_0",port="1"} 0
864# HELP node_infiniband_port_packets_received_total Number of packets received on all VLs by this port (including errors) 856# HELP node_infiniband_port_packets_received_total Number of packets received on all VLs by this port (including errors)
865# TYPE node_infiniband_port_packets_received_total counter 857# TYPE node_infiniband_port_packets_received_total counter
866node_infiniband_port_packets_received_total{device="i40iw0",port="1"} 0
867node_infiniband_port_packets_received_total{device="mlx4_0",port="1"} 6.825908347e+09 858node_infiniband_port_packets_received_total{device="mlx4_0",port="1"} 6.825908347e+09
868# HELP node_infiniband_port_packets_transmitted_total Number of packets transmitted on all VLs from this port (including errors) 859# HELP node_infiniband_port_packets_transmitted_total Number of packets transmitted on all VLs from this port (including errors)
869# TYPE node_infiniband_port_packets_transmitted_total counter 860# TYPE node_infiniband_port_packets_transmitted_total counter
870node_infiniband_port_packets_transmitted_total{device="i40iw0",port="1"} 0
871node_infiniband_port_packets_transmitted_total{device="mlx4_0",port="1"} 6.235865e+06 861node_infiniband_port_packets_transmitted_total{device="mlx4_0",port="1"} 6.235865e+06
872# HELP node_infiniband_port_transmit_wait_total Number of ticks during which the port had data to transmit but no data was sent during the entire tick 862# HELP node_infiniband_port_transmit_wait_total Number of ticks during which the port had data to transmit but no data was sent during the entire tick
873# TYPE node_infiniband_port_transmit_wait_total counter 863# TYPE node_infiniband_port_transmit_wait_total counter
874node_infiniband_port_transmit_wait_total{device="i40iw0",port="1"} 0
875node_infiniband_port_transmit_wait_total{device="mlx4_0",port="1"} 4.294967295e+09 864node_infiniband_port_transmit_wait_total{device="mlx4_0",port="1"} 4.294967295e+09
876# HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors) 865# HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors)
877# TYPE node_infiniband_unicast_packets_received_total counter 866# TYPE node_infiniband_unicast_packets_received_total counter
diff --git a/collector/fixtures/e2e-output.txt b/collector/fixtures/e2e-output.txt
index 7651f53..607d250 100644
--- a/collector/fixtures/e2e-output.txt
+++ b/collector/fixtures/e2e-output.txt
@@ -816,12 +816,10 @@ node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="1
816node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 61239 816node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 61239
817# HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down 817# HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down
818# TYPE node_infiniband_link_downed_total counter 818# TYPE node_infiniband_link_downed_total counter
819node_infiniband_link_downed_total{device="i40iw0",port="1"} 0
820node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0 819node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0
821node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0 820node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0
822# HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state 821# HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state
823# TYPE node_infiniband_link_error_recovery_total counter 822# TYPE node_infiniband_link_error_recovery_total counter
824node_infiniband_link_error_recovery_total{device="i40iw0",port="1"} 0
825node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0 823node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0
826node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0 824node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0
827# HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors) 825# HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors)
@@ -834,20 +832,16 @@ node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16
834node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0 832node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0
835# HELP node_infiniband_port_constraint_errors_received_total Number of packets received on the switch physical port that are discarded 833# HELP node_infiniband_port_constraint_errors_received_total Number of packets received on the switch physical port that are discarded
836# TYPE node_infiniband_port_constraint_errors_received_total counter 834# TYPE node_infiniband_port_constraint_errors_received_total counter
837node_infiniband_port_constraint_errors_received_total{device="i40iw0",port="1"} 0
838node_infiniband_port_constraint_errors_received_total{device="mlx4_0",port="1"} 0 835node_infiniband_port_constraint_errors_received_total{device="mlx4_0",port="1"} 0
839# HELP node_infiniband_port_constraint_errors_transmitted_total Number of packets not transmitted from the switch physical port 836# HELP node_infiniband_port_constraint_errors_transmitted_total Number of packets not transmitted from the switch physical port
840# TYPE node_infiniband_port_constraint_errors_transmitted_total counter 837# TYPE node_infiniband_port_constraint_errors_transmitted_total counter
841node_infiniband_port_constraint_errors_transmitted_total{device="i40iw0",port="1"} 0
842node_infiniband_port_constraint_errors_transmitted_total{device="mlx4_0",port="1"} 0 838node_infiniband_port_constraint_errors_transmitted_total{device="mlx4_0",port="1"} 0
843# HELP node_infiniband_port_data_received_bytes_total Number of data octets received on all links 839# HELP node_infiniband_port_data_received_bytes_total Number of data octets received on all links
844# TYPE node_infiniband_port_data_received_bytes_total counter 840# TYPE node_infiniband_port_data_received_bytes_total counter
845node_infiniband_port_data_received_bytes_total{device="i40iw0",port="1"} 0
846node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="1"} 1.8527668e+07 841node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="1"} 1.8527668e+07
847node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="2"} 0 842node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="2"} 0
848# HELP node_infiniband_port_data_transmitted_bytes_total Number of data octets transmitted on all links 843# HELP node_infiniband_port_data_transmitted_bytes_total Number of data octets transmitted on all links
849# TYPE node_infiniband_port_data_transmitted_bytes_total counter 844# TYPE node_infiniband_port_data_transmitted_bytes_total counter
850node_infiniband_port_data_transmitted_bytes_total{device="i40iw0",port="1"} 0
851node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="1"} 1.493376e+07 845node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="1"} 1.493376e+07
852node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0 846node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0
853# HELP node_infiniband_port_discards_received_total Number of inbound packets discarded by the port because the port is down or congested 847# HELP node_infiniband_port_discards_received_total Number of inbound packets discarded by the port because the port is down or congested
@@ -855,23 +849,18 @@ node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0
855node_infiniband_port_discards_received_total{device="mlx4_0",port="1"} 0 849node_infiniband_port_discards_received_total{device="mlx4_0",port="1"} 0
856# HELP node_infiniband_port_discards_transmitted_total Number of outbound packets discarded by the port because the port is down or congested 850# HELP node_infiniband_port_discards_transmitted_total Number of outbound packets discarded by the port because the port is down or congested
857# TYPE node_infiniband_port_discards_transmitted_total counter 851# TYPE node_infiniband_port_discards_transmitted_total counter
858node_infiniband_port_discards_transmitted_total{device="i40iw0",port="1"} 0
859node_infiniband_port_discards_transmitted_total{device="mlx4_0",port="1"} 5 852node_infiniband_port_discards_transmitted_total{device="mlx4_0",port="1"} 5
860# HELP node_infiniband_port_errors_received_total Number of packets containing an error that were received on this port 853# HELP node_infiniband_port_errors_received_total Number of packets containing an error that were received on this port
861# TYPE node_infiniband_port_errors_received_total counter 854# TYPE node_infiniband_port_errors_received_total counter
862node_infiniband_port_errors_received_total{device="i40iw0",port="1"} 0
863node_infiniband_port_errors_received_total{device="mlx4_0",port="1"} 0 855node_infiniband_port_errors_received_total{device="mlx4_0",port="1"} 0
864# HELP node_infiniband_port_packets_received_total Number of packets received on all VLs by this port (including errors) 856# HELP node_infiniband_port_packets_received_total Number of packets received on all VLs by this port (including errors)
865# TYPE node_infiniband_port_packets_received_total counter 857# TYPE node_infiniband_port_packets_received_total counter
866node_infiniband_port_packets_received_total{device="i40iw0",port="1"} 0
867node_infiniband_port_packets_received_total{device="mlx4_0",port="1"} 6.825908347e+09 858node_infiniband_port_packets_received_total{device="mlx4_0",port="1"} 6.825908347e+09
868# HELP node_infiniband_port_packets_transmitted_total Number of packets transmitted on all VLs from this port (including errors) 859# HELP node_infiniband_port_packets_transmitted_total Number of packets transmitted on all VLs from this port (including errors)
869# TYPE node_infiniband_port_packets_transmitted_total counter 860# TYPE node_infiniband_port_packets_transmitted_total counter
870node_infiniband_port_packets_transmitted_total{device="i40iw0",port="1"} 0
871node_infiniband_port_packets_transmitted_total{device="mlx4_0",port="1"} 6.235865e+06 861node_infiniband_port_packets_transmitted_total{device="mlx4_0",port="1"} 6.235865e+06
872# HELP node_infiniband_port_transmit_wait_total Number of ticks during which the port had data to transmit but no data was sent during the entire tick 862# HELP node_infiniband_port_transmit_wait_total Number of ticks during which the port had data to transmit but no data was sent during the entire tick
873# TYPE node_infiniband_port_transmit_wait_total counter 863# TYPE node_infiniband_port_transmit_wait_total counter
874node_infiniband_port_transmit_wait_total{device="i40iw0",port="1"} 0
875node_infiniband_port_transmit_wait_total{device="mlx4_0",port="1"} 4.294967295e+09 864node_infiniband_port_transmit_wait_total{device="mlx4_0",port="1"} 4.294967295e+09
876# HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors) 865# HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors)
877# TYPE node_infiniband_unicast_packets_received_total counter 866# TYPE node_infiniband_unicast_packets_received_total counter
diff --git a/collector/fixtures/sys.ttar b/collector/fixtures/sys.ttar
index 41e062d..8502ec6 100644
--- a/collector/fixtures/sys.ttar
+++ b/collector/fixtures/sys.ttar
@@ -112,6 +112,21 @@ Mode: 755
112Directory: sys/class/infiniband/i40iw0 112Directory: sys/class/infiniband/i40iw0
113Mode: 755 113Mode: 755
114# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 114# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
115Path: sys/class/infiniband/i40iw0/board_id
116Lines: 1
117I40IW Board ID
118Mode: 644
119# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
120Path: sys/class/infiniband/i40iw0/fw_ver
121Lines: 1
1220.2
123Mode: 644
124# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
125Path: sys/class/infiniband/i40iw0/hca_type
126Lines: 1
127I40IW
128Mode: 644
129# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
115Directory: sys/class/infiniband/i40iw0/ports 130Directory: sys/class/infiniband/i40iw0/ports
116Mode: 755 131Mode: 755
117# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 132# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@@ -206,9 +221,39 @@ Lines: 1
206N/A (no PMA) 221N/A (no PMA)
207Mode: 644 222Mode: 644
208# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 223# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
224Path: sys/class/infiniband/i40iw0/ports/1/phys_state
225Lines: 1
2265: LinkUp
227Mode: 644
228# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
229Path: sys/class/infiniband/i40iw0/ports/1/rate
230Lines: 1
23110 Gb/sec (4X)
232Mode: 644
233# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
234Path: sys/class/infiniband/i40iw0/ports/1/state
235Lines: 1
2364: ACTIVE
237Mode: 644
238# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
209Directory: sys/class/infiniband/mlx4_0 239Directory: sys/class/infiniband/mlx4_0
210Mode: 755 240Mode: 755
211# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 241# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
242Path: sys/class/infiniband/mlx4_0/board_id
243Lines: 1
244SM_1141000001000
245Mode: 644
246# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
247Path: sys/class/infiniband/mlx4_0/fw_ver
248Lines: 1
2492.31.5050
250Mode: 644
251# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
252Path: sys/class/infiniband/mlx4_0/hca_type
253Lines: 1
254MT4099
255Mode: 644
256# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
212Directory: sys/class/infiniband/mlx4_0/ports 257Directory: sys/class/infiniband/mlx4_0/ports
213Mode: 755 258Mode: 755
214# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 259# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@@ -346,6 +391,21 @@ Lines: 1
3460 3910
347Mode: 644 392Mode: 644
348# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 393# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
394Path: sys/class/infiniband/mlx4_0/ports/1/phys_state
395Lines: 1
3965: LinkUp
397Mode: 644
398# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
399Path: sys/class/infiniband/mlx4_0/ports/1/rate
400Lines: 1
40140 Gb/sec (4X QDR)
402Mode: 644
403# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
404Path: sys/class/infiniband/mlx4_0/ports/1/state
405Lines: 1
4064: ACTIVE
407Mode: 644
408# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
349Directory: sys/class/infiniband/mlx4_0/ports/2 409Directory: sys/class/infiniband/mlx4_0/ports/2
350Mode: 755 410Mode: 755
351# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 411# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@@ -435,6 +495,21 @@ Lines: 1
4350 4950
436Mode: 644 496Mode: 644
437# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 497# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
498Path: sys/class/infiniband/mlx4_0/ports/2/phys_state
499Lines: 1
5005: LinkUp
501Mode: 644
502# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
503Path: sys/class/infiniband/mlx4_0/ports/2/rate
504Lines: 1
50540 Gb/sec (4X QDR)
506Mode: 644
507# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
508Path: sys/class/infiniband/mlx4_0/ports/2/state
509Lines: 1
5104: ACTIVE
511Mode: 644
512# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
438Directory: sys/class/net 513Directory: sys/class/net
439Mode: 755 514Mode: 755
440# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 515# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
diff --git a/collector/infiniband_linux.go b/collector/infiniband_linux.go
index 16134f3..237a913 100644
--- a/collector/infiniband_linux.go
+++ b/collector/infiniband_linux.go
@@ -1,4 +1,4 @@
1// Copyright 2017 The Prometheus Authors 1// Copyright 2017-2019 The Prometheus Authors
2// Licensed under the Apache License, Version 2.0 (the "License"); 2// Licensed under the Apache License, Version 2.0 (the "License");
3// you may not use this file except in compliance with the License. 3// you may not use this file except in compliance with the License.
4// You may obtain a copy of the License at 4// You may obtain a copy of the License at
@@ -17,31 +17,16 @@
17package collector 17package collector
18 18
19import ( 19import (
20 "errors" 20 "fmt"
21 "os" 21 "strconv"
22 "path/filepath"
23 "strings"
24 22
25 "github.com/prometheus/client_golang/prometheus" 23 "github.com/prometheus/client_golang/prometheus"
26 "github.com/prometheus/common/log" 24 "github.com/prometheus/procfs/sysfs"
27)
28
29const infinibandPath = "class/infiniband"
30
31var (
32 errInfinibandNoDevicesFound = errors.New("no InfiniBand devices detected")
33 errInfinibandNoPortsFound = errors.New("no InfiniBand ports detected")
34) 25)
35 26
36type infinibandCollector struct { 27type infinibandCollector struct {
37 metricDescs map[string]*prometheus.Desc 28 fs sysfs.FS
38 counters map[string]infinibandMetric 29 metricDescs map[string]*prometheus.Desc
39 legacyCounters map[string]infinibandMetric
40}
41
42type infinibandMetric struct {
43 File string
44 Help string
45} 30}
46 31
47func init() { 32func init() {
@@ -51,55 +36,47 @@ func init() {
51// NewInfiniBandCollector returns a new Collector exposing InfiniBand stats. 36// NewInfiniBandCollector returns a new Collector exposing InfiniBand stats.
52func NewInfiniBandCollector() (Collector, error) { 37func NewInfiniBandCollector() (Collector, error) {
53 var i infinibandCollector 38 var i infinibandCollector
39 var err error
54 40
55 // Filenames of all InfiniBand counter metrics including a detailed description. 41 i.fs, err = sysfs.NewFS(*sysPath)
56 i.counters = map[string]infinibandMetric{ 42 if err != nil {
57 "link_downed_total": {"link_downed", "Number of times the link failed to recover from an error state and went down"}, 43 return nil, fmt.Errorf("failed to open sysfs: %v", err)
58 "link_error_recovery_total": {"link_error_recovery", "Number of times the link successfully recovered from an error state"},
59 "multicast_packets_received_total": {"multicast_rcv_packets", "Number of multicast packets received (including errors)"},
60 "multicast_packets_transmitted_total": {"multicast_xmit_packets", "Number of multicast packets transmitted (including errors)"},
61 "port_constraint_errors_received_total": {"port_rcv_constraint_errors", "Number of packets received on the switch physical port that are discarded"},
62 "port_constraint_errors_transmitted_total": {"port_xmit_constraint_errors", "Number of packets not transmitted from the switch physical port"},
63 "port_data_received_bytes_total": {"port_rcv_data", "Number of data octets received on all links"},
64 "port_data_transmitted_bytes_total": {"port_xmit_data", "Number of data octets transmitted on all links"},
65 "port_discards_received_total": {"port_rcv_discards", "Number of inbound packets discarded by the port because the port is down or congested"},
66 "port_discards_transmitted_total": {"port_xmit_discards", "Number of outbound packets discarded by the port because the port is down or congested"},
67 "port_errors_received_total": {"port_rcv_errors", "Number of packets containing an error that were received on this port"},
68 "port_packets_received_total": {"port_rcv_packets", "Number of packets received on all VLs by this port (including errors)"},
69 "port_packets_transmitted_total": {"port_xmit_packets", "Number of packets transmitted on all VLs from this port (including errors)"},
70 "port_transmit_wait_total": {"port_xmit_wait", "Number of ticks during which the port had data to transmit but no data was sent during the entire tick"},
71 "unicast_packets_received_total": {"unicast_rcv_packets", "Number of unicast packets received (including errors)"},
72 "unicast_packets_transmitted_total": {"unicast_xmit_packets", "Number of unicast packets transmitted (including errors)"},
73 } 44 }
74 45
75 // Deprecated counters for some older versions of InfiniBand drivers. 46 // Detailed description for all metrics.
76 i.legacyCounters = map[string]infinibandMetric{ 47 descriptions := map[string]string{
77 "legacy_multicast_packets_received_total": {"port_multicast_rcv_packets", "Number of multicast packets received"}, 48 "legacy_multicast_packets_received_total": "Number of multicast packets received",
78 "legacy_multicast_packets_transmitted_total": {"port_multicast_xmit_packets", "Number of multicast packets transmitted"}, 49 "legacy_multicast_packets_transmitted_total": "Number of multicast packets transmitted",
79 "legacy_data_received_bytes_total": {"port_rcv_data_64", "Number of data octets received on all links"}, 50 "legacy_data_received_bytes_total": "Number of data octets received on all links",
80 "legacy_packets_received_total": {"port_rcv_packets_64", "Number of data packets received on all links"}, 51 "legacy_packets_received_total": "Number of data packets received on all links",
81 "legacy_unicast_packets_received_total": {"port_unicast_rcv_packets", "Number of unicast packets received"}, 52 "legacy_unicast_packets_received_total": "Number of unicast packets received",
82 "legacy_unicast_packets_transmitted_total": {"port_unicast_xmit_packets", "Number of unicast packets transmitted"}, 53 "legacy_unicast_packets_transmitted_total": "Number of unicast packets transmitted",
83 "legacy_data_transmitted_bytes_total": {"port_xmit_data_64", "Number of data octets transmitted on all links"}, 54 "legacy_data_transmitted_bytes_total": "Number of data octets transmitted on all links",
84 "legacy_packets_transmitted_total": {"port_xmit_packets_64", "Number of data packets received on all links"}, 55 "legacy_packets_transmitted_total": "Number of data packets received on all links",
56 "link_downed_total": "Number of times the link failed to recover from an error state and went down",
57 "link_error_recovery_total": "Number of times the link successfully recovered from an error state",
58 "multicast_packets_received_total": "Number of multicast packets received (including errors)",
59 "multicast_packets_transmitted_total": "Number of multicast packets transmitted (including errors)",
60 "port_constraint_errors_received_total": "Number of packets received on the switch physical port that are discarded",
61 "port_constraint_errors_transmitted_total": "Number of packets not transmitted from the switch physical port",
62 "port_data_received_bytes_total": "Number of data octets received on all links",
63 "port_data_transmitted_bytes_total": "Number of data octets transmitted on all links",
64 "port_discards_received_total": "Number of inbound packets discarded by the port because the port is down or congested",
65 "port_discards_transmitted_total": "Number of outbound packets discarded by the port because the port is down or congested",
66 "port_errors_received_total": "Number of packets containing an error that were received on this port",
67 "port_packets_received_total": "Number of packets received on all VLs by this port (including errors)",
68 "port_packets_transmitted_total": "Number of packets transmitted on all VLs from this port (including errors)",
69 "port_transmit_wait_total": "Number of ticks during which the port had data to transmit but no data was sent during the entire tick",
70 "unicast_packets_received_total": "Number of unicast packets received (including errors)",
71 "unicast_packets_transmitted_total": "Number of unicast packets transmitted (including errors)",
85 } 72 }
86 73
87 subsystem := "infiniband"
88 i.metricDescs = make(map[string]*prometheus.Desc) 74 i.metricDescs = make(map[string]*prometheus.Desc)
89 75
90 for metricName, infinibandMetric := range i.counters { 76 for metricName, description := range descriptions {
91 i.metricDescs[metricName] = prometheus.NewDesc(
92 prometheus.BuildFQName(namespace, subsystem, metricName),
93 infinibandMetric.Help,
94 []string{"device", "port"},
95 nil,
96 )
97 }
98
99 for metricName, infinibandMetric := range i.legacyCounters {
100 i.metricDescs[metricName] = prometheus.NewDesc( 77 i.metricDescs[metricName] = prometheus.NewDesc(
101 prometheus.BuildFQName(namespace, subsystem, metricName), 78 prometheus.BuildFQName(namespace, "infiniband", metricName),
102 infinibandMetric.Help, 79 description,
103 []string{"device", "port"}, 80 []string{"device", "port"},
104 nil, 81 nil,
105 ) 82 )
@@ -108,141 +85,50 @@ func NewInfiniBandCollector() (Collector, error) {
108 return &i, nil 85 return &i, nil
109} 86}
110 87
111// infinibandDevices retrieves a list of InfiniBand devices. 88func (c *infinibandCollector) pushMetric(ch chan<- prometheus.Metric, name string, value uint64, deviceName string, port string, valueType prometheus.ValueType) {
112func infinibandDevices(infinibandPath string) ([]string, error) { 89 ch <- prometheus.MustNewConstMetric(c.metricDescs[name], valueType, float64(value), deviceName, port)
113 devices, err := filepath.Glob(filepath.Join(infinibandPath, "/*"))
114 if err != nil {
115 return nil, err
116 }
117
118 if len(devices) < 1 {
119 log.Debugf("Unable to detect InfiniBand devices")
120 err = errInfinibandNoDevicesFound
121 return nil, err
122 }
123
124 // Extract just the filenames which equate to the device names.
125 for i, device := range devices {
126 devices[i] = filepath.Base(device)
127 }
128
129 return devices, nil
130}
131
132// Retrieve a list of ports for the InfiniBand device.
133func infinibandPorts(infinibandPath, device string) ([]string, error) {
134 ports, err := filepath.Glob(filepath.Join(infinibandPath, device, "ports/*"))
135 if err != nil {
136 return nil, err
137 }
138
139 if len(ports) < 1 {
140 log.Debugf("Unable to detect ports for %s", device)
141 err = errInfinibandNoPortsFound
142 return nil, err
143 }
144
145 // Extract just the filenames which equates to the port numbers.
146 for i, port := range ports {
147 ports[i] = filepath.Base(port)
148 }
149
150 return ports, nil
151} 90}
152 91
153func readMetric(directory, metricFile string) (uint64, error) { 92func (c *infinibandCollector) pushCounter(ch chan<- prometheus.Metric, name string, value *uint64, deviceName string, port string) {
154 metric, err := readUintFromFile(filepath.Join(directory, metricFile)) 93 if value != nil {
155 if err != nil { 94 c.pushMetric(ch, name, *value, deviceName, port, prometheus.CounterValue)
156 // Ugly workaround for handling #966, when counters are
157 // `N/A (not available)`.
158 // This was already patched and submitted, see
159 // https://www.spinics.net/lists/linux-rdma/msg68596.html
160 // Remove this as soon as the fix lands in the enterprise distros.
161 if strings.Contains(err.Error(), "N/A (no PMA)") {
162 log.Debugf("%q value is N/A", metricFile)
163 return 0, nil
164 }
165 log.Debugf("Error reading %q file", metricFile)
166 return 0, err
167 }
168
169 // According to Mellanox, the following metrics "are divided by 4 unconditionally"
170 // as they represent the amount of data being transmitted and received per lane.
171 // Mellanox cards have 4 lanes per port, so all values must be multiplied by 4
172 // to get the expected value.
173 switch metricFile {
174 case "port_rcv_data", "port_xmit_data", "port_rcv_data_64", "port_xmit_data_64":
175 metric *= 4
176 } 95 }
177
178 return metric, nil
179} 96}
180 97
181func (c *infinibandCollector) Update(ch chan<- prometheus.Metric) error { 98func (c *infinibandCollector) Update(ch chan<- prometheus.Metric) error {
182 devices, err := infinibandDevices(sysFilePath(infinibandPath)) 99 devices, err := c.fs.InfiniBandClass()
183 100 if err != nil {
184 // If no devices are found or another error is raised while attempting to find devices, 101 return fmt.Errorf("error obtaining InfiniBand class info: %s", err)
185 // InfiniBand is likely not installed and the collector should be skipped.
186 switch err {
187 case nil:
188 case errInfinibandNoDevicesFound:
189 return nil
190 default:
191 return err
192 } 102 }
193 103
194 for _, device := range devices { 104 for _, device := range devices {
195 ports, err := infinibandPorts(sysFilePath(infinibandPath), device) 105 for _, port := range device.Ports {
196 106 portStr := strconv.FormatUint(uint64(port.Port), 10)
197 // If no ports are found for the specified device, skip to the next device. 107
198 switch err { 108 c.pushCounter(ch, "legacy_multicast_packets_received_total", port.Counters.LegacyPortMulticastRcvPackets, port.Name, portStr)
199 case nil: 109 c.pushCounter(ch, "legacy_multicast_packets_transmitted_total", port.Counters.LegacyPortMulticastXmitPackets, port.Name, portStr)
200 case errInfinibandNoPortsFound: 110 c.pushCounter(ch, "legacy_data_received_bytes_total", port.Counters.LegacyPortRcvData64, port.Name, portStr)
201 continue 111 c.pushCounter(ch, "legacy_packets_received_total", port.Counters.LegacyPortRcvPackets64, port.Name, portStr)
202 default: 112 c.pushCounter(ch, "legacy_unicast_packets_received_total", port.Counters.LegacyPortUnicastRcvPackets, port.Name, portStr)
203 return err 113 c.pushCounter(ch, "legacy_unicast_packets_transmitted_total", port.Counters.LegacyPortUnicastXmitPackets, port.Name, portStr)
204 } 114 c.pushCounter(ch, "legacy_data_transmitted_bytes_total", port.Counters.LegacyPortXmitData64, port.Name, portStr)
205 115 c.pushCounter(ch, "legacy_packets_transmitted_total", port.Counters.LegacyPortXmitPackets64, port.Name, portStr)
206 for _, port := range ports { 116 c.pushCounter(ch, "link_downed_total", port.Counters.LinkDowned, port.Name, portStr)
207 portFiles := sysFilePath(filepath.Join(infinibandPath, device, "ports", port)) 117 c.pushCounter(ch, "link_error_recovery_total", port.Counters.LinkErrorRecovery, port.Name, portStr)
208 118 c.pushCounter(ch, "multicast_packets_received_total", port.Counters.MulticastRcvPackets, port.Name, portStr)
209 // Add metrics for the InfiniBand counters. 119 c.pushCounter(ch, "multicast_packets_transmitted_total", port.Counters.MulticastXmitPackets, port.Name, portStr)
210 for metricName, infinibandMetric := range c.counters { 120 c.pushCounter(ch, "port_constraint_errors_received_total", port.Counters.PortRcvConstraintErrors, port.Name, portStr)
211 if _, err := os.Stat(filepath.Join(portFiles, "counters", infinibandMetric.File)); os.IsNotExist(err) { 121 c.pushCounter(ch, "port_constraint_errors_transmitted_total", port.Counters.PortXmitConstraintErrors, port.Name, portStr)
212 continue 122 c.pushCounter(ch, "port_data_received_bytes_total", port.Counters.PortRcvData, port.Name, portStr)
213 } 123 c.pushCounter(ch, "port_data_transmitted_bytes_total", port.Counters.PortXmitData, port.Name, portStr)
214 metric, err := readMetric(filepath.Join(portFiles, "counters"), infinibandMetric.File) 124 c.pushCounter(ch, "port_discards_received_total", port.Counters.PortRcvDiscards, port.Name, portStr)
215 if err != nil { 125 c.pushCounter(ch, "port_discards_transmitted_total", port.Counters.PortXmitDiscards, port.Name, portStr)
216 return err 126 c.pushCounter(ch, "port_errors_received_total", port.Counters.PortRcvErrors, port.Name, portStr)
217 } 127 c.pushCounter(ch, "port_packets_received_total", port.Counters.PortRcvPackets, port.Name, portStr)
218 128 c.pushCounter(ch, "port_packets_transmitted_total", port.Counters.PortXmitPackets, port.Name, portStr)
219 ch <- prometheus.MustNewConstMetric( 129 c.pushCounter(ch, "port_transmit_wait_total", port.Counters.PortXmitWait, port.Name, portStr)
220 c.metricDescs[metricName], 130 c.pushCounter(ch, "unicast_packets_received_total", port.Counters.UnicastRcvPackets, port.Name, portStr)
221 prometheus.CounterValue, 131 c.pushCounter(ch, "unicast_packets_transmitted_total", port.Counters.UnicastXmitPackets, port.Name, portStr)
222 float64(metric),
223 device,
224 port,
225 )
226 }
227
228 // Add metrics for the legacy InfiniBand counters.
229 for metricName, infinibandMetric := range c.legacyCounters {
230 if _, err := os.Stat(filepath.Join(portFiles, "counters_ext", infinibandMetric.File)); os.IsNotExist(err) {
231 continue
232 }
233 metric, err := readMetric(filepath.Join(portFiles, "counters_ext"), infinibandMetric.File)
234 if err != nil {
235 return err
236 }
237
238 ch <- prometheus.MustNewConstMetric(
239 c.metricDescs[metricName],
240 prometheus.CounterValue,
241 float64(metric),
242 device,
243 port,
244 )
245 }
246 } 132 }
247 } 133 }
248 134
diff --git a/collector/infiniband_linux_test.go b/collector/infiniband_linux_test.go
deleted file mode 100644
index d2090f8..0000000
--- a/collector/infiniband_linux_test.go
+++ /dev/null
@@ -1,40 +0,0 @@
1// Copyright 2017 The Prometheus Authors
2// Licensed under the Apache License, Version 2.0 (the "License");
3// you may not use this file except in compliance with the License.
4// You may obtain a copy of the License at
5//
6// http://www.apache.org/licenses/LICENSE-2.0
7//
8// Unless required by applicable law or agreed to in writing, software
9// distributed under the License is distributed on an "AS IS" BASIS,
10// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11// See the License for the specific language governing permissions and
12// limitations under the License.
13
14package collector
15
16import (
17 "testing"
18)
19
20func TestInfiniBandDevices(t *testing.T) {
21 devices, err := infinibandDevices("fixtures/sys/class/infiniband")
22 if err != nil {
23 t.Fatal(err)
24 }
25
26 if l := len(devices); l != 2 {
27 t.Fatalf("Retrieved an unexpected number of InfiniBand devices: %d", l)
28 }
29}
30
31func TestInfiniBandPorts(t *testing.T) {
32 ports, err := infinibandPorts("fixtures/sys/class/infiniband", "mlx4_0")
33 if err != nil {
34 t.Fatal(err)
35 }
36
37 if l := len(ports); l != 2 {
38 t.Fatalf("Retrieved an unexpected number of InfiniBand ports: %d", l)
39 }
40}