aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMario Trangoni <mario@mariotrangoni.de>2018-10-04 15:05:59 +0200
committerBen Kochie <superq@gmail.com>2018-10-04 15:05:59 +0200
commit3659260b66e304ca5692354493c6828764c56897 (patch)
treed771a4d1142e1f230a1fbcc158f9dd6db2f9eaea
parent0f9842f20a036d2db8f29102e6bc0cd7b1672865 (diff)
downloadprometheus_node_collector-3659260b66e304ca5692354493c6828764c56897.tar.bz2
prometheus_node_collector-3659260b66e304ca5692354493c6828764c56897.tar.xz
prometheus_node_collector-3659260b66e304ca5692354493c6828764c56897.zip
infiniband: Handle iWARP* RDMA modules N/A (#974)
* infiniband: Add fixtures for a not-connected i40iw0/ports/1 * infiniband: Handle the case where iWARP* RDMA module counters are not available. This is related to #966 and handles the following error: Jun 07 13:33:24 hostname node_exporter[81888]: time="2018-06-07T13:33:24+02:00" level=error msg="ERROR: infiniband collector failed after 0.000929s: strconv.ParseUint: parsing \"N/A (no PMA)\": invalid syntax" source="collector.go:132" Signed-off-by: Mario Trangoni <mjtrangoni@gmail.com>
-rw-r--r--collector/fixtures/e2e-64k-page-output.txt4
-rw-r--r--collector/fixtures/e2e-output.txt4
-rw-r--r--collector/fixtures/sys.ttar97
-rw-r--r--collector/infiniband_linux.go10
-rw-r--r--collector/infiniband_linux_test.go2
5 files changed, 116 insertions, 1 deletions
diff --git a/collector/fixtures/e2e-64k-page-output.txt b/collector/fixtures/e2e-64k-page-output.txt
index edfb373..f694567 100644
--- a/collector/fixtures/e2e-64k-page-output.txt
+++ b/collector/fixtures/e2e-64k-page-output.txt
@@ -787,10 +787,12 @@ node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="1
787node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 61239 787node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 61239
788# HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down 788# HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down
789# TYPE node_infiniband_link_downed_total counter 789# TYPE node_infiniband_link_downed_total counter
790node_infiniband_link_downed_total{device="i40iw0",port="1"} 0
790node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0 791node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0
791node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0 792node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0
792# HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state 793# HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state
793# TYPE node_infiniband_link_error_recovery_total counter 794# TYPE node_infiniband_link_error_recovery_total counter
795node_infiniband_link_error_recovery_total{device="i40iw0",port="1"} 0
794node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0 796node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0
795node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0 797node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0
796# HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors) 798# HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors)
@@ -803,10 +805,12 @@ node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16
803node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0 805node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0
804# HELP node_infiniband_port_data_received_bytes_total Number of data octets received on all links 806# HELP node_infiniband_port_data_received_bytes_total Number of data octets received on all links
805# TYPE node_infiniband_port_data_received_bytes_total counter 807# TYPE node_infiniband_port_data_received_bytes_total counter
808node_infiniband_port_data_received_bytes_total{device="i40iw0",port="1"} 0
806node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="1"} 1.8527668e+07 809node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="1"} 1.8527668e+07
807node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="2"} 0 810node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="2"} 0
808# HELP node_infiniband_port_data_transmitted_bytes_total Number of data octets transmitted on all links 811# HELP node_infiniband_port_data_transmitted_bytes_total Number of data octets transmitted on all links
809# TYPE node_infiniband_port_data_transmitted_bytes_total counter 812# TYPE node_infiniband_port_data_transmitted_bytes_total counter
813node_infiniband_port_data_transmitted_bytes_total{device="i40iw0",port="1"} 0
810node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="1"} 1.493376e+07 814node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="1"} 1.493376e+07
811node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0 815node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0
812# HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors) 816# HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors)
diff --git a/collector/fixtures/e2e-output.txt b/collector/fixtures/e2e-output.txt
index 9e24ada..5f0b8ac 100644
--- a/collector/fixtures/e2e-output.txt
+++ b/collector/fixtures/e2e-output.txt
@@ -787,10 +787,12 @@ node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="1
787node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 61239 787node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 61239
788# HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down 788# HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down
789# TYPE node_infiniband_link_downed_total counter 789# TYPE node_infiniband_link_downed_total counter
790node_infiniband_link_downed_total{device="i40iw0",port="1"} 0
790node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0 791node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0
791node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0 792node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0
792# HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state 793# HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state
793# TYPE node_infiniband_link_error_recovery_total counter 794# TYPE node_infiniband_link_error_recovery_total counter
795node_infiniband_link_error_recovery_total{device="i40iw0",port="1"} 0
794node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0 796node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0
795node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0 797node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0
796# HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors) 798# HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors)
@@ -803,10 +805,12 @@ node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16
803node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0 805node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0
804# HELP node_infiniband_port_data_received_bytes_total Number of data octets received on all links 806# HELP node_infiniband_port_data_received_bytes_total Number of data octets received on all links
805# TYPE node_infiniband_port_data_received_bytes_total counter 807# TYPE node_infiniband_port_data_received_bytes_total counter
808node_infiniband_port_data_received_bytes_total{device="i40iw0",port="1"} 0
806node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="1"} 1.8527668e+07 809node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="1"} 1.8527668e+07
807node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="2"} 0 810node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="2"} 0
808# HELP node_infiniband_port_data_transmitted_bytes_total Number of data octets transmitted on all links 811# HELP node_infiniband_port_data_transmitted_bytes_total Number of data octets transmitted on all links
809# TYPE node_infiniband_port_data_transmitted_bytes_total counter 812# TYPE node_infiniband_port_data_transmitted_bytes_total counter
813node_infiniband_port_data_transmitted_bytes_total{device="i40iw0",port="1"} 0
810node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="1"} 1.493376e+07 814node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="1"} 1.493376e+07
811node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0 815node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0
812# HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors) 816# HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors)
diff --git a/collector/fixtures/sys.ttar b/collector/fixtures/sys.ttar
index d8982ed..b8e4546 100644
--- a/collector/fixtures/sys.ttar
+++ b/collector/fixtures/sys.ttar
@@ -109,6 +109,103 @@ Mode: 644
109Directory: sys/class/infiniband 109Directory: sys/class/infiniband
110Mode: 755 110Mode: 755
111# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 111# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
112Directory: sys/class/infiniband/i40iw0
113Mode: 755
114# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
115Directory: sys/class/infiniband/i40iw0/ports
116Mode: 755
117# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
118Directory: sys/class/infiniband/i40iw0/ports/1
119Mode: 755
120# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
121Directory: sys/class/infiniband/i40iw0/ports/1/counters
122Mode: 755
123# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
124Path: sys/class/infiniband/i40iw0/ports/1/counters/excessive_buffer_overrun_errors
125Lines: 1
126N/A (no PMA)
127Mode: 644
128# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
129Path: sys/class/infiniband/i40iw0/ports/1/counters/link_downed
130Lines: 1
131N/A (no PMA)
132Mode: 644
133# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
134Path: sys/class/infiniband/i40iw0/ports/1/counters/link_error_recovery
135Lines: 1
136N/A (no PMA)
137Mode: 644
138# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
139Path: sys/class/infiniband/i40iw0/ports/1/counters/local_link_integrity_errors
140Lines: 1
141N/A (no PMA)
142Mode: 644
143# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
144Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_constraint_errors
145Lines: 1
146N/A (no PMA)
147Mode: 644
148# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
149Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_data
150Lines: 1
151N/A (no PMA)
152Mode: 644
153# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
154Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_errors
155Lines: 1
156N/A (no PMA)
157Mode: 644
158# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
159Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_packets
160Lines: 1
161N/A (no PMA)
162Mode: 644
163# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
164Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_remote_physical_errors
165Lines: 1
166N/A (no PMA)
167Mode: 644
168# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
169Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_switch_relay_errors
170Lines: 1
171N/A (no PMA)
172Mode: 644
173# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
174Path: sys/class/infiniband/i40iw0/ports/1/counters/port_xmit_constraint_errors
175Lines: 1
176N/A (no PMA)
177Mode: 644
178# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
179Path: sys/class/infiniband/i40iw0/ports/1/counters/port_xmit_data
180Lines: 1
181N/A (no PMA)
182Mode: 644
183# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
184Path: sys/class/infiniband/i40iw0/ports/1/counters/port_xmit_discards
185Lines: 1
186N/A (no PMA)
187Mode: 644
188# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
189Path: sys/class/infiniband/i40iw0/ports/1/counters/port_xmit_packets
190Lines: 1
191N/A (no PMA)
192Mode: 644
193# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
194Path: sys/class/infiniband/i40iw0/ports/1/counters/port_xmit_wait
195Lines: 1
196N/A (no PMA)
197Mode: 644
198# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
199Path: sys/class/infiniband/i40iw0/ports/1/counters/symbol_error
200Lines: 1
201N/A (no PMA)
202Mode: 644
203# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
204Path: sys/class/infiniband/i40iw0/ports/1/counters/VL15_dropped
205Lines: 1
206N/A (no PMA)
207Mode: 644
208# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
112Directory: sys/class/infiniband/mlx4_0 209Directory: sys/class/infiniband/mlx4_0
113Mode: 755 210Mode: 755
114# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 211# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
diff --git a/collector/infiniband_linux.go b/collector/infiniband_linux.go
index 7c21976..660e4bd 100644
--- a/collector/infiniband_linux.go
+++ b/collector/infiniband_linux.go
@@ -20,6 +20,7 @@ import (
20 "errors" 20 "errors"
21 "os" 21 "os"
22 "path/filepath" 22 "path/filepath"
23 "strings"
23 24
24 "github.com/prometheus/client_golang/prometheus" 25 "github.com/prometheus/client_golang/prometheus"
25 "github.com/prometheus/common/log" 26 "github.com/prometheus/common/log"
@@ -144,6 +145,15 @@ func infinibandPorts(infinibandPath, device string) ([]string, error) {
144func readMetric(directory, metricFile string) (uint64, error) { 145func readMetric(directory, metricFile string) (uint64, error) {
145 metric, err := readUintFromFile(filepath.Join(directory, metricFile)) 146 metric, err := readUintFromFile(filepath.Join(directory, metricFile))
146 if err != nil { 147 if err != nil {
148 // Ugly workaround for handling #966, when counters are
149 // `N/A (no PMA)`, i.e. not available.
150 // This was already patched and submitted, see
151 // https://www.spinics.net/lists/linux-rdma/msg68596.html
152 // Remove this as soon as the fix lands in the enterprise distros.
153 if strings.Contains(err.Error(), "N/A (no PMA)") {
154 log.Debugf("%q value is N/A", metricFile)
155 return 0, nil
156 }
147 log.Debugf("Error reading %q file", metricFile) 157 log.Debugf("Error reading %q file", metricFile)
148 return 0, err 158 return 0, err
149 } 159 }
diff --git a/collector/infiniband_linux_test.go b/collector/infiniband_linux_test.go
index 68370c0..d2090f8 100644
--- a/collector/infiniband_linux_test.go
+++ b/collector/infiniband_linux_test.go
@@ -23,7 +23,7 @@ func TestInfiniBandDevices(t *testing.T) {
23 t.Fatal(err) 23 t.Fatal(err)
24 } 24 }
25 25
26 if l := len(devices); l != 1 { 26 if l := len(devices); l != 2 {
27 t.Fatalf("Retrieved an unexpected number of InfiniBand devices: %d", l) 27 t.Fatalf("Retrieved an unexpected number of InfiniBand devices: %d", l)
28 } 28 }
29} 29}