diff options
author | Julian Kornberger <jk+github@digineo.de> | 2019-02-27 22:19:55 +0100 |
---|---|---|
committer | Ben Kochie <superq@gmail.com> | 2019-02-27 22:19:55 +0100 |
commit | 5110efc1cddecb02a66f2048ed8339c51d6615e0 (patch) | |
tree | 6b62fd3e6b8927c7068bf79db8c5c6b315346501 | |
parent | 8ca1e5594bf5ff8fecdc9c30a2ea7b6055d6a646 (diff) | |
download | prometheus_node_collector-5110efc1cddecb02a66f2048ed8339c51d6615e0.tar.bz2 prometheus_node_collector-5110efc1cddecb02a66f2048ed8339c51d6615e0.tar.xz prometheus_node_collector-5110efc1cddecb02a66f2048ed8339c51d6615e0.zip |
Translate smartmon.sh to Python (#1225)
* Add smartmon.py python port of the smartmon.sh bash script
Signed-off-by: Arthur Skowronek <ags@digineo.de>
-rwxr-xr-x | text_collector_examples/smartmon.py | 375 |
1 files changed, 375 insertions, 0 deletions
diff --git a/text_collector_examples/smartmon.py b/text_collector_examples/smartmon.py new file mode 100755 index 0000000..4eb1075 --- /dev/null +++ b/text_collector_examples/smartmon.py | |||
@@ -0,0 +1,375 @@ | |||
1 | #!/usr/bin/env python3 | ||
2 | import argparse | ||
3 | import collections | ||
4 | import csv | ||
5 | import datetime | ||
6 | import decimal | ||
7 | import re | ||
8 | import subprocess | ||
9 | import shlex | ||
10 | |||
# Splits one line of `smartctl --info` output into a key ("k") and a value
# ("v").  The separator is either ":" or " is:", so a line such as
# "SMART support is: Enabled" yields the key "SMART support".
device_info_re = re.compile(
    r'^(?P<k>[^:]+?)(?:(?:\sis|):)\s*(?P<v>.*)$'
)

# Captures the error number from an "Error N [M] occurred" line.  MULTILINE
# lets "^" match at the start of every line of the searched text.
ata_error_count_re = re.compile(
    r'^Error (\d+) \[\d+\] occurred',
    flags=re.MULTILINE,
)

# Maps keys found in the `smartctl --info` output to the label names used on
# the device_info metric.  Insertion order determines label order.
device_info_map = {
    'Vendor': 'vendor',
    'Product': 'product',
    'Revision': 'revision',
    'Logical Unit id': 'lun_id',
    'Model Family': 'model_family',
    'Device Model': 'device_model',
    'Serial Number': 'serial_number',
    'Firmware Version': 'firmware_version',
}
26 | |||
# Lower-cased SMART attribute names that are exported as metrics; attributes
# not listed here are ignored.  Kept in alphabetical order.
smart_attributes_whitelist = {
    'airflow_temperature_cel',
    'command_timeout',
    'current_pending_sector',
    'end_to_end_error',
    'erase_fail_count_total',
    'g_sense_error_rate',
    'hardware_ecc_recovered',
    'host_reads_32mib',
    'host_reads_mib',
    'host_writes_32mib',
    'host_writes_mib',
    'load_cycle_count',
    'media_wearout_indicator',
    'nand_writes_1gib',
    'offline_uncorrectable',
    'power_cycle_count',
    'power_on_hours',
    'program_fail_count',
    'raw_read_error_rate',
    'reallocated_event_count',
    'reallocated_sector_ct',
    'reported_uncorrect',
    'sata_downshift_count',
    'seek_error_rate',
    'spin_retry_count',
    'spin_up_time',
    'start_stop_count',
    'temperature_case',
    'temperature_celsius',
    'temperature_internal',
    'total_lbas_read',
    'total_lbas_written',
    'udma_crc_error_count',
    'unsafe_shutdown_count',
    'wear_leveling_count',
    'workld_host_reads_perc',
    'workld_media_wear_indic',
    'workload_minutes',
}
67 | |||
# One sample to print: metric name, dict of labels and the sample value.
Metric = collections.namedtuple('Metric', ['name', 'labels', 'value'])

# One parsed row of the SMART attribute table.  The last field, raw_value,
# receives whatever columns remain after the first nine.
SmartAttribute = collections.namedtuple(
    'SmartAttribute',
    'id name flag value worst threshold type updated when_failed raw_value',
)
74 | |||
75 | |||
class Device(collections.namedtuple('DeviceBase', ['path', 'opts'])):
    """A device as listed by the smartctl scan output.

    ``path`` is the first token of the scan line and ``opts`` holds the
    parsed options that followed it.
    """

    @property
    def type(self):
        """Device type taken from the parsed scan options."""
        return self.opts.type

    @property
    def base_labels(self):
        """Labels attached to every metric emitted for this device."""
        return dict(disk=self.path)

    def smartctl_select(self):
        """Return the smartctl arguments that address this device."""
        selector = ['--device', self.type]
        selector.append(self.path)
        return selector
89 | |||
90 | |||
def metric_key(metric, prefix=''):
    """Return the exposition name of a metric.

    Args:
        metric: (Metric) Metric whose name is used.
        prefix: (str) Text put in front of the metric name.

    Returns:
        (str) ``prefix`` immediately followed by ``metric.name``.
    """
    return '{prefix}{metric.name}'.format(prefix=prefix, metric=metric)


def _escape_label_value(value):
    """Escape a label value for the Prometheus text exposition format."""
    # The backslash has to be handled first, otherwise the backslashes
    # introduced by the other two replacements would be escaped again.
    return (
        str(value)
        .replace('\\', '\\\\')
        .replace('"', '\\"')
        .replace('\n', '\\n')
    )


def metric_format(metric, prefix=''):
    """Render a metric as one line of Prometheus text exposition format.

    Label values are escaped so that strings reported by a device (model
    names, serial numbers, ...) containing a backslash, a double quote or a
    newline cannot produce an unparsable line.

    Args:
        metric: (Metric) Metric to render. Its value must be accepted by
            ``decimal.Decimal`` (bool, int or a numeric string).
        prefix: (str) Text put in front of the metric name.

    Returns:
        (str) A line of the form ``name{label="value",...} number``.
    """
    key = metric_key(metric, prefix)
    labels = ','.join(
        '{k}="{v}"'.format(k=k, v=_escape_label_value(v))
        for k, v in metric.labels.items())
    # Decimal turns True/False into 1/0 and keeps numeric strings intact.
    value = decimal.Decimal(metric.value)

    return '{key}{{{labels}}} {value}'.format(
        key=key, labels=labels, value=value)
103 | |||
104 | |||
def metric_print_meta(metric, prefix=''):
    """Print the ``# HELP`` and ``# TYPE`` lines that introduce a metric.

    Every metric is declared with the type ``gauge``.

    Args:
        metric: (Metric) Metric to describe.
        prefix: (str) Text put in front of the metric name.
    """
    name = metric_key(metric, prefix)
    help_line = '# HELP {key} SMART metric {metric.name}'.format(
        key=name, metric=metric)
    type_line = '# TYPE {key} gauge'.format(key=name)
    print(help_line)
    print(type_line)
110 | |||
111 | |||
def metric_print(metric, prefix=''):
    """Print the formatted sample line of a metric to standard output.

    Args:
        metric: (Metric) Metric to print.
        prefix: (str) Text put in front of the metric name.
    """
    line = metric_format(metric, prefix)
    print(line)
114 | |||
115 | |||
def smart_ctl(*args, check=True):
    """Run the smartctl binary and return what it wrote to stdout.

    Args:
        *args: (str) Command line arguments passed to smartctl.
        check: (bool) Passed through to ``subprocess.run``; when true, a
            non-zero exit status raises ``subprocess.CalledProcessError``.

    Returns:
        (str) Standard output of the subprocess, decoded as UTF-8.
    """
    command = ['smartctl']
    command.extend(args)
    completed = subprocess.run(command, stdout=subprocess.PIPE, check=check)
    return completed.stdout.decode('utf-8')
125 | |||
126 | |||
def smart_ctl_version():
    """Return the version reported by ``smartctl -V``.

    Returns:
        (str) Second whitespace-separated token of the first output line.
    """
    banner, _, _ = smart_ctl('-V').partition('\n')
    return banner.split()[1]
129 | |||
130 | |||
def find_devices():
    """Enumerate the devices listed by ``smartctl --scan-open``.

    Each non-empty output line is tokenized shell-style; the first token is
    the device path and the rest is parsed for a ``-d``/``--device`` option.

    Yields:
        (Device) One device per usable line of the scan output.
    """
    option_parser = argparse.ArgumentParser()
    option_parser.add_argument('-d', '--device', dest='type')

    scan_output = smart_ctl('--scan-open')

    for raw_line in scan_output.split('\n'):
        # With comments=True everything after a "#" is dropped, so blank
        # lines and lines consisting only of a comment yield no tokens.
        fields = shlex.split(raw_line.strip(), comments=True)
        if fields:
            path, *options = fields
            yield Device(path, option_parser.parse_args(options))
152 | |||
153 | |||
def device_is_active(device):
    """Tell whether the given device is currently active.

    The device counts as active when ``smartctl --nocheck standby`` exits
    with status zero for it.

    Args:
        device: (Device) Device in question.

    Returns:
        (bool) True if the device is active and False otherwise.
    """
    try:
        smart_ctl('--nocheck', 'standby', *device.smartctl_select())
    except subprocess.CalledProcessError:
        active = False
    else:
        active = True

    return active
169 | |||
170 | |||
def device_info(device):
    """Read the key/value pairs printed by ``smartctl --info``.

    Args:
        device: (Device) Device in question.

    Returns:
        (generator): Generator yielding one 2-tuple per matching line:

            key (str): Key describing the value.
            value (str): Actual value.
    """
    output = smart_ctl('--info', *device.smartctl_select())

    # The first three lines of the stripped output are skipped; lines that
    # do not look like "key: value" are dropped.
    body = output.strip().split('\n')[3:]

    return (
        match.groups()
        for match in map(device_info_re.match, body)
        if match is not None
    )
189 | |||
190 | |||
def device_smart_capabilities(device):
    """Determine whether SMART is available and enabled on a device.

    Looks at every "SMART support" entry of the device information and
    takes the first word of its value.

    Args:
        device: (Device) Device in question.

    Returns:
        (tuple): tuple containing:

            (bool): True if SMART is available, False otherwise.
            (bool): True if SMART is enabled, False otherwise.
    """
    states = set()
    for key, value in device_info(device):
        if key == 'SMART support':
            states.add(value.split(' ', 1)[0])

    return 'Available' in states, 'Enabled' in states
213 | |||
214 | |||
def collect_device_info(device):
    """Build the ``device_info`` metric for a device.

    The metric carries the device's base labels plus one label for every
    key of ``device_info_map`` present in the device information.

    Args:
        device: (Device) Device in question.

    Yields:
        (Metric) A single metric with the value True.
    """
    reported = dict(device_info(device))

    labels = dict(device.base_labels)
    for source_key, label_name in device_info_map.items():
        if source_key in reported:
            labels[label_name] = reported[source_key]

    yield Metric('device_info', labels, True)
229 | |||
230 | |||
def collect_device_health_self_assessment(device):
    """Collect metric about the device health self assessment.

    The assessment counts as passed when any output line of
    ``smartctl --health`` starts with "SMART" and ends with "PASSED" or
    "OK". If no such line exists (including when smartctl itself failed)
    the device is reported as not healthy.

    Args:
        device: (Device) Device in question.

    Yields:
        (Metric) Device health self assessment.
    """
    # check=False: smartctl signals a failing disk through a non-zero exit
    # status. Raising on that would abort the whole run in exactly the
    # situation this metric exists to report.
    report = smart_ctl(
        '--health', *device.smartctl_select(), check=False)

    # Search every line instead of relying on a fixed line number, so that
    # shorter or longer output cannot raise an IndexError.
    self_assessment_passed = any(
        line.startswith('SMART')
        and line.rstrip().endswith(('PASSED', 'OK'))
        for line in report.splitlines())

    yield Metric(
        'device_smart_healthy', device.base_labels, self_assessment_passed)
249 | |||
250 | |||
def collect_ata_metrics(device):
    """Collect the whitelisted SMART attributes of an ATA device.

    Args:
        device: (Device) Device in question.

    Yields:
        (Metric) For every whitelisted attribute one metric each named
        ``attr_value``, ``attr_worst``, ``attr_threshold`` and
        ``attr_raw_value``, labelled with the attribute name and the
        device's base labels.
    """
    # Fetch SMART attributes for the given device.
    attributes = smart_ctl(
        '--attributes', *device.smartctl_select()
    )

    # Replace multiple occurrences of whitespace with a single whitespace
    # so that the CSV parser recognizes individual columns properly.
    attributes = re.sub(r'[\t\x20]+', ' ', attributes)

    # Turn smartctl output into a list of lines and skip to the table of
    # SMART attributes.
    attribute_lines = attributes.strip().split('\n')[7:]

    # The first nine columns map onto the named fields; everything after
    # them is collected as a list under the last field, raw_value.
    reader = csv.DictReader(
        (line.strip() for line in attribute_lines),
        fieldnames=SmartAttribute._fields[:-1],
        restkey=SmartAttribute._fields[-1], delimiter=' ')
    for entry in reader:
        # Rows with too few columns have None for the missing fields;
        # treat a missing name as "not whitelisted" instead of crashing.
        name = (entry.get('name') or '').lower()

        # We're only interested in the SMART attributes that are
        # whitelisted here.
        if name not in smart_attributes_whitelist:
            continue

        # raw_value only exists when the row had more than nine columns.
        raw_columns = entry.get('raw_value')
        if not raw_columns:
            continue

        # Ensure that only the leading numeric part is taken from the raw
        # value. Some attributes are reported in a format such as
        # "36 (Min/Max 24/40)", which can't be expressed as a prometheus
        # metric value.
        m = re.match(r'(\d+)', ' '.join(raw_columns))
        if not m:
            continue
        entry['raw_value'] = m.group(1)

        labels = {
            'name': name,
            **device.base_labels,
        }

        # raw_value is emitted as well; it was parsed above for exactly
        # that purpose.
        for col in 'value', 'worst', 'threshold', 'raw_value':
            yield Metric('attr_{col}'.format(col=col), labels, entry[col])
295 | |||
296 | |||
def collect_ata_error_count(device):
    """Report the error number found in the device's extended error log.

    The log is requested with ``-l xerror,1`` and the exit status of
    smartctl is ignored. The number is taken from the first
    "Error N [M] occurred" line; without such a line the count is 0.

    Args:
        device: (Device) Device in question.

    Yields:
        (Metric) Device error count.
    """
    log_text = smart_ctl(
        '-l', 'xerror,1', *device.smartctl_select(), check=False)

    found = ata_error_count_re.search(log_text)
    if found is None:
        count = 0
    else:
        count = found.group(1)

    yield Metric('device_errors', device.base_labels, count)
314 | |||
315 | |||
def collect_disks_smart_metrics():
    """Collect all SMART metrics for every device found by smartctl.

    Yields:
        (Metric) For each device: the run timestamp and the active flag,
        and, while the device is active and SMART is available, the device
        information, SMART capability flags, health self assessment and
        (for device types starting with "sat") the ATA attribute and error
        count metrics.
    """
    # Use a timezone-aware datetime: calling timestamp() on the naive
    # result of utcnow() interprets it as local time and yields a value
    # shifted by the host's UTC offset.
    now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())

    for device in find_devices():
        yield Metric('smartctl_run', device.base_labels, now)

        is_active = device_is_active(device)

        yield Metric('device_active', device.base_labels, is_active)

        # Skip further metrics collection to prevent the disk from
        # spinning up.
        if not is_active:
            continue

        yield from collect_device_info(device)

        smart_available, smart_enabled = device_smart_capabilities(device)

        yield Metric(
            'device_smart_available', device.base_labels, smart_available)
        yield Metric(
            'device_smart_enabled', device.base_labels, smart_enabled)

        # Skip further metrics collection here if SMART is not available
        # on the device. Further smartctl invocations would fail anyways.
        # NOTE(review): a device with SMART available but disabled is not
        # skipped here - confirm whether that is intended.
        if not smart_available:
            continue

        yield from collect_device_health_self_assessment(device)

        if device.type.startswith('sat'):
            yield from collect_ata_metrics(device)

            yield from collect_ata_error_count(device)
352 | |||
353 | |||
def main():
    """Print the smartctl version metric followed by all disk metrics.

    Disk metrics are sorted by name so that the HELP/TYPE header of each
    metric is printed once, directly before its first sample.
    """
    prefix = 'smartmon_'

    version_metric = Metric(
        'smartctl_version', {'version': smart_ctl_version()}, True)
    metric_print_meta(version_metric, prefix)
    metric_print(version_metric, prefix)

    collected = sorted(
        collect_disks_smart_metrics(), key=lambda metric: metric.name)

    last_name = None
    for metric in collected:
        # A change of name marks the start of the next metric family.
        if metric.name != last_name:
            metric_print_meta(metric, prefix)
            last_name = metric.name

        metric_print(metric, prefix)
372 | |||
# Run the collector only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
375 | |||