diff options
author | Mario Trangoni <mario@mariotrangoni.de> | 2018-10-04 15:05:59 +0200 |
---|---|---|
committer | Ben Kochie <superq@gmail.com> | 2018-10-04 15:05:59 +0200 |
commit | 3659260b66e304ca5692354493c6828764c56897 (patch) | |
tree | d771a4d1142e1f230a1fbcc158f9dd6db2f9eaea | |
parent | 0f9842f20a036d2db8f29102e6bc0cd7b1672865 (diff) | |
download | prometheus_node_collector-3659260b66e304ca5692354493c6828764c56897.tar.bz2 prometheus_node_collector-3659260b66e304ca5692354493c6828764c56897.tar.xz prometheus_node_collector-3659260b66e304ca5692354493c6828764c56897.zip |
infiniband: Handle iWARP* RDMA modules N/A (#974)
* infiniband: Add not connected i40iw0/ports/1 fixtures
* infiniband: Handle issue when iWARP* RDMA modules are not available
This is related to #966 and handles this error:
Jun 07 13:33:24 hostname node_exporter[81888]: time="2018-06-07T13:33:24+02:00" level=error msg="ERROR: infiniband
collector failed after 0.000929s: strconv.ParseUint: parsing \"N/A (no PMA)\": invalid syntax" source="collector.go:132"
Signed-off-by: Mario Trangoni <mjtrangoni@gmail.com>
-rw-r--r-- | collector/fixtures/e2e-64k-page-output.txt | 4 | ||||
-rw-r--r-- | collector/fixtures/e2e-output.txt | 4 | ||||
-rw-r--r-- | collector/fixtures/sys.ttar | 97 | ||||
-rw-r--r-- | collector/infiniband_linux.go | 10 | ||||
-rw-r--r-- | collector/infiniband_linux_test.go | 2 |
5 files changed, 116 insertions, 1 deletions
diff --git a/collector/fixtures/e2e-64k-page-output.txt b/collector/fixtures/e2e-64k-page-output.txt index edfb373..f694567 100644 --- a/collector/fixtures/e2e-64k-page-output.txt +++ b/collector/fixtures/e2e-64k-page-output.txt | |||
@@ -787,10 +787,12 @@ node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="1 | |||
787 | node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 61239 | 787 | node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 61239 |
788 | # HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down | 788 | # HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down |
789 | # TYPE node_infiniband_link_downed_total counter | 789 | # TYPE node_infiniband_link_downed_total counter |
790 | node_infiniband_link_downed_total{device="i40iw0",port="1"} 0 | ||
790 | node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0 | 791 | node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0 |
791 | node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0 | 792 | node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0 |
792 | # HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state | 793 | # HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state |
793 | # TYPE node_infiniband_link_error_recovery_total counter | 794 | # TYPE node_infiniband_link_error_recovery_total counter |
795 | node_infiniband_link_error_recovery_total{device="i40iw0",port="1"} 0 | ||
794 | node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0 | 796 | node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0 |
795 | node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0 | 797 | node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0 |
796 | # HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors) | 798 | # HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors) |
@@ -803,10 +805,12 @@ node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16 | |||
803 | node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0 | 805 | node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0 |
804 | # HELP node_infiniband_port_data_received_bytes_total Number of data octets received on all links | 806 | # HELP node_infiniband_port_data_received_bytes_total Number of data octets received on all links |
805 | # TYPE node_infiniband_port_data_received_bytes_total counter | 807 | # TYPE node_infiniband_port_data_received_bytes_total counter |
808 | node_infiniband_port_data_received_bytes_total{device="i40iw0",port="1"} 0 | ||
806 | node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="1"} 1.8527668e+07 | 809 | node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="1"} 1.8527668e+07 |
807 | node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="2"} 0 | 810 | node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="2"} 0 |
808 | # HELP node_infiniband_port_data_transmitted_bytes_total Number of data octets transmitted on all links | 811 | # HELP node_infiniband_port_data_transmitted_bytes_total Number of data octets transmitted on all links |
809 | # TYPE node_infiniband_port_data_transmitted_bytes_total counter | 812 | # TYPE node_infiniband_port_data_transmitted_bytes_total counter |
813 | node_infiniband_port_data_transmitted_bytes_total{device="i40iw0",port="1"} 0 | ||
810 | node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="1"} 1.493376e+07 | 814 | node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="1"} 1.493376e+07 |
811 | node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0 | 815 | node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0 |
812 | # HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors) | 816 | # HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors) |
diff --git a/collector/fixtures/e2e-output.txt b/collector/fixtures/e2e-output.txt index 9e24ada..5f0b8ac 100644 --- a/collector/fixtures/e2e-output.txt +++ b/collector/fixtures/e2e-output.txt | |||
@@ -787,10 +787,12 @@ node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="1 | |||
787 | node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 61239 | 787 | node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 61239 |
788 | # HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down | 788 | # HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down |
789 | # TYPE node_infiniband_link_downed_total counter | 789 | # TYPE node_infiniband_link_downed_total counter |
790 | node_infiniband_link_downed_total{device="i40iw0",port="1"} 0 | ||
790 | node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0 | 791 | node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0 |
791 | node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0 | 792 | node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0 |
792 | # HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state | 793 | # HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state |
793 | # TYPE node_infiniband_link_error_recovery_total counter | 794 | # TYPE node_infiniband_link_error_recovery_total counter |
795 | node_infiniband_link_error_recovery_total{device="i40iw0",port="1"} 0 | ||
794 | node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0 | 796 | node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0 |
795 | node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0 | 797 | node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0 |
796 | # HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors) | 798 | # HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors) |
@@ -803,10 +805,12 @@ node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16 | |||
803 | node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0 | 805 | node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0 |
804 | # HELP node_infiniband_port_data_received_bytes_total Number of data octets received on all links | 806 | # HELP node_infiniband_port_data_received_bytes_total Number of data octets received on all links |
805 | # TYPE node_infiniband_port_data_received_bytes_total counter | 807 | # TYPE node_infiniband_port_data_received_bytes_total counter |
808 | node_infiniband_port_data_received_bytes_total{device="i40iw0",port="1"} 0 | ||
806 | node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="1"} 1.8527668e+07 | 809 | node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="1"} 1.8527668e+07 |
807 | node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="2"} 0 | 810 | node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="2"} 0 |
808 | # HELP node_infiniband_port_data_transmitted_bytes_total Number of data octets transmitted on all links | 811 | # HELP node_infiniband_port_data_transmitted_bytes_total Number of data octets transmitted on all links |
809 | # TYPE node_infiniband_port_data_transmitted_bytes_total counter | 812 | # TYPE node_infiniband_port_data_transmitted_bytes_total counter |
813 | node_infiniband_port_data_transmitted_bytes_total{device="i40iw0",port="1"} 0 | ||
810 | node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="1"} 1.493376e+07 | 814 | node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="1"} 1.493376e+07 |
811 | node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0 | 815 | node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0 |
812 | # HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors) | 816 | # HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors) |
diff --git a/collector/fixtures/sys.ttar b/collector/fixtures/sys.ttar index d8982ed..b8e4546 100644 --- a/collector/fixtures/sys.ttar +++ b/collector/fixtures/sys.ttar | |||
@@ -109,6 +109,103 @@ Mode: 644 | |||
109 | Directory: sys/class/infiniband | 109 | Directory: sys/class/infiniband |
110 | Mode: 755 | 110 | Mode: 755 |
111 | # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | 111 | # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
112 | Directory: sys/class/infiniband/i40iw0 | ||
113 | Mode: 755 | ||
114 | # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
115 | Directory: sys/class/infiniband/i40iw0/ports | ||
116 | Mode: 755 | ||
117 | # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
118 | Directory: sys/class/infiniband/i40iw0/ports/1 | ||
119 | Mode: 755 | ||
120 | # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
121 | Directory: sys/class/infiniband/i40iw0/ports/1/counters | ||
122 | Mode: 755 | ||
123 | # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
124 | Path: sys/class/infiniband/i40iw0/ports/1/counters/excessive_buffer_overrun_errors | ||
125 | Lines: 1 | ||
126 | N/A (no PMA) | ||
127 | Mode: 644 | ||
128 | # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
129 | Path: sys/class/infiniband/i40iw0/ports/1/counters/link_downed | ||
130 | Lines: 1 | ||
131 | N/A (no PMA) | ||
132 | Mode: 644 | ||
133 | # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
134 | Path: sys/class/infiniband/i40iw0/ports/1/counters/link_error_recovery | ||
135 | Lines: 1 | ||
136 | N/A (no PMA) | ||
137 | Mode: 644 | ||
138 | # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
139 | Path: sys/class/infiniband/i40iw0/ports/1/counters/local_link_integrity_errors | ||
140 | Lines: 1 | ||
141 | N/A (no PMA) | ||
142 | Mode: 644 | ||
143 | # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
144 | Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_constraint_errors | ||
145 | Lines: 1 | ||
146 | N/A (no PMA) | ||
147 | Mode: 644 | ||
148 | # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
149 | Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_data | ||
150 | Lines: 1 | ||
151 | N/A (no PMA) | ||
152 | Mode: 644 | ||
153 | # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
154 | Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_errors | ||
155 | Lines: 1 | ||
156 | N/A (no PMA) | ||
157 | Mode: 644 | ||
158 | # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
159 | Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_packets | ||
160 | Lines: 1 | ||
161 | N/A (no PMA) | ||
162 | Mode: 644 | ||
163 | # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
164 | Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_remote_physical_errors | ||
165 | Lines: 1 | ||
166 | N/A (no PMA) | ||
167 | Mode: 644 | ||
168 | # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
169 | Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_switch_relay_errors | ||
170 | Lines: 1 | ||
171 | N/A (no PMA) | ||
172 | Mode: 644 | ||
173 | # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
174 | Path: sys/class/infiniband/i40iw0/ports/1/counters/port_xmit_constraint_errors | ||
175 | Lines: 1 | ||
176 | N/A (no PMA) | ||
177 | Mode: 644 | ||
178 | # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
179 | Path: sys/class/infiniband/i40iw0/ports/1/counters/port_xmit_data | ||
180 | Lines: 1 | ||
181 | N/A (no PMA) | ||
182 | Mode: 644 | ||
183 | # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
184 | Path: sys/class/infiniband/i40iw0/ports/1/counters/port_xmit_discards | ||
185 | Lines: 1 | ||
186 | N/A (no PMA) | ||
187 | Mode: 644 | ||
188 | # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
189 | Path: sys/class/infiniband/i40iw0/ports/1/counters/port_xmit_packets | ||
190 | Lines: 1 | ||
191 | N/A (no PMA) | ||
192 | Mode: 644 | ||
193 | # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
194 | Path: sys/class/infiniband/i40iw0/ports/1/counters/port_xmit_wait | ||
195 | Lines: 1 | ||
196 | N/A (no PMA) | ||
197 | Mode: 644 | ||
198 | # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
199 | Path: sys/class/infiniband/i40iw0/ports/1/counters/symbol_error | ||
200 | Lines: 1 | ||
201 | N/A (no PMA) | ||
202 | Mode: 644 | ||
203 | # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
204 | Path: sys/class/infiniband/i40iw0/ports/1/counters/VL15_dropped | ||
205 | Lines: 1 | ||
206 | N/A (no PMA) | ||
207 | Mode: 644 | ||
208 | # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
112 | Directory: sys/class/infiniband/mlx4_0 | 209 | Directory: sys/class/infiniband/mlx4_0 |
113 | Mode: 755 | 210 | Mode: 755 |
114 | # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | 211 | # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
diff --git a/collector/infiniband_linux.go b/collector/infiniband_linux.go index 7c21976..660e4bd 100644 --- a/collector/infiniband_linux.go +++ b/collector/infiniband_linux.go | |||
@@ -20,6 +20,7 @@ import ( | |||
20 | "errors" | 20 | "errors" |
21 | "os" | 21 | "os" |
22 | "path/filepath" | 22 | "path/filepath" |
23 | "strings" | ||
23 | 24 | ||
24 | "github.com/prometheus/client_golang/prometheus" | 25 | "github.com/prometheus/client_golang/prometheus" |
25 | "github.com/prometheus/common/log" | 26 | "github.com/prometheus/common/log" |
@@ -144,6 +145,15 @@ func infinibandPorts(infinibandPath, device string) ([]string, error) { | |||
144 | func readMetric(directory, metricFile string) (uint64, error) { | 145 | func readMetric(directory, metricFile string) (uint64, error) { |
145 | metric, err := readUintFromFile(filepath.Join(directory, metricFile)) | 146 | metric, err := readUintFromFile(filepath.Join(directory, metricFile)) |
146 | if err != nil { | 147 | if err != nil { |
148 | // Ugly workaround for handling #966, when counters are | ||
149 | // `N/A (not available)`. | ||
150 | // This was already patched and submitted, see | ||
151 | // https://www.spinics.net/lists/linux-rdma/msg68596.html | ||
152 | // Remove this as soon as the fix lands in the enterprise distros. | ||
153 | if strings.Contains(err.Error(), "N/A (no PMA)") { | ||
154 | log.Debugf("%q value is N/A", metricFile) | ||
155 | return 0, nil | ||
156 | } | ||
147 | log.Debugf("Error reading %q file", metricFile) | 157 | log.Debugf("Error reading %q file", metricFile) |
148 | return 0, err | 158 | return 0, err |
149 | } | 159 | } |
diff --git a/collector/infiniband_linux_test.go b/collector/infiniband_linux_test.go index 68370c0..d2090f8 100644 --- a/collector/infiniband_linux_test.go +++ b/collector/infiniband_linux_test.go | |||
@@ -23,7 +23,7 @@ func TestInfiniBandDevices(t *testing.T) { | |||
23 | t.Fatal(err) | 23 | t.Fatal(err) |
24 | } | 24 | } |
25 | 25 | ||
26 | if l := len(devices); l != 1 { | 26 | if l := len(devices); l != 2 { |
27 | t.Fatalf("Retrieved an unexpected number of InfiniBand devices: %d", l) | 27 | t.Fatalf("Retrieved an unexpected number of InfiniBand devices: %d", l) |
28 | } | 28 | } |
29 | } | 29 | } |