about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Julian Kornberger <jk+github@digineo.de> 2019-02-27 22:19:55 +0100
committer: Ben Kochie <superq@gmail.com> 2019-02-27 22:19:55 +0100
commit: 5110efc1cddecb02a66f2048ed8339c51d6615e0 (patch)
tree: 6b62fd3e6b8927c7068bf79db8c5c6b315346501
parent: 8ca1e5594bf5ff8fecdc9c30a2ea7b6055d6a646 (diff)
download: prometheus_node_collector-5110efc1cddecb02a66f2048ed8339c51d6615e0.tar.bz2
prometheus_node_collector-5110efc1cddecb02a66f2048ed8339c51d6615e0.tar.xz
prometheus_node_collector-5110efc1cddecb02a66f2048ed8339c51d6615e0.zip
Translate smartmon.py to Python (#1225)
* Add smartmon.py python port of the smartmon.sh bash script Signed-off-by: Arthur Skowronek <ags@digineo.de>
-rwxr-xr-x text_collector_examples/smartmon.py 375
1 files changed, 375 insertions, 0 deletions
diff --git a/text_collector_examples/smartmon.py b/text_collector_examples/smartmon.py
new file mode 100755
index 0000000..4eb1075
--- /dev/null
+++ b/text_collector_examples/smartmon.py
@@ -0,0 +1,375 @@
1#!/usr/bin/env python3
2import argparse
3import collections
4import csv
5import datetime
6import decimal
7import re
8import subprocess
9import shlex
10
# Splits one "Key: value" / "Key is: value" line into the named groups
# ``k`` (the key, without a trailing " is") and ``v`` (the value).
device_info_re = re.compile(
    r'^(?P<k>[^:]+?)(?:(?:\sis|):)\s*(?P<v>.*)$')

# Captures the error number from an "Error <n> [<m>] occurred" line found
# anywhere in a multi-line text.
ata_error_count_re = re.compile(
    r'^Error (\d+) \[\d+\] occurred',
    flags=re.MULTILINE,
)
15
# Maps the keys of interest in the device information output to the label
# names used on the ``device_info`` metric. Insertion order determines the
# order in which the labels are emitted.
device_info_map = {
    # Identification fields
    'Vendor': 'vendor',
    'Product': 'product',
    'Revision': 'revision',
    'Logical Unit id': 'lun_id',
    # Model / serial / firmware fields
    'Model Family': 'model_family',
    'Device Model': 'device_model',
    'Serial Number': 'serial_number',
    'Firmware Version': 'firmware_version',
}
26
# Lower-cased SMART attribute names that are exported as metrics; every
# other attribute is ignored. Kept in alphabetical order.
smart_attributes_whitelist = {
    'airflow_temperature_cel',
    'command_timeout',
    'current_pending_sector',
    'end_to_end_error',
    'erase_fail_count_total',
    'g_sense_error_rate',
    'hardware_ecc_recovered',
    'host_reads_32mib',
    'host_reads_mib',
    'host_writes_32mib',
    'host_writes_mib',
    'load_cycle_count',
    'media_wearout_indicator',
    'nand_writes_1gib',
    'offline_uncorrectable',
    'power_cycle_count',
    'power_on_hours',
    'program_fail_count',
    'raw_read_error_rate',
    'reallocated_event_count',
    'reallocated_sector_ct',
    'reported_uncorrect',
    'sata_downshift_count',
    'seek_error_rate',
    'spin_retry_count',
    'spin_up_time',
    'start_stop_count',
    'temperature_case',
    'temperature_celsius',
    'temperature_internal',
    'total_lbas_read',
    'total_lbas_written',
    'udma_crc_error_count',
    'unsafe_shutdown_count',
    'wear_leveling_count',
    'workld_host_reads_perc',
    'workld_media_wear_indic',
    'workload_minutes',
}
67
# One sample to be printed: metric name, label dict and sample value.
Metric = collections.namedtuple('Metric', ['name', 'labels', 'value'])

# One row of the SMART attribute table. The field order follows the table
# columns; ``raw_value`` is last so it can collect all trailing columns.
SmartAttribute = collections.namedtuple(
    'SmartAttribute',
    'id name flag value worst threshold type updated when_failed raw_value',
)
74
75
class Device(collections.namedtuple('DeviceBase', 'path opts')):
    """A device, described by its path and its parsed smartctl options.

    Fields:
        path: Device path handed to smartctl.
        opts: Object exposing the device type as its ``type`` attribute.
    """

    @property
    def type(self):
        """Device type taken from the parsed options."""
        options = self.opts
        return options.type

    @property
    def base_labels(self):
        """Labels shared by every metric reported for this device."""
        return dict(disk=self.path)

    def smartctl_select(self):
        """Return the smartctl arguments that address this device."""
        selector = ['--device', self.type]
        selector.append(self.path)
        return selector
89
90
def metric_key(metric, prefix=''):
    """Return the metric's name with ``prefix`` put in front of it."""
    return '{}{}'.format(prefix, metric.name)
93
94
def _escape_label_value(value):
    """Escape a label value for the Prometheus text exposition format.

    Backslash, double quote and line feed must be written as ``\\\\``,
    ``\\"`` and ``\\n``; anything else is passed through unchanged.
    """
    return (
        str(value)
        .replace('\\', '\\\\')
        .replace('"', '\\"')
        .replace('\n', '\\n')
    )


def metric_format(metric, prefix=''):
    """Render a metric as one line of Prometheus text exposition format.

    Args:
        metric: (Metric) Metric to render.
        prefix: (str) String prepended to the metric name.

    Returns:
        (str) Line of the form ``name{label="value",...} value``.

    Raises:
        decimal.InvalidOperation: If the metric value is a string that
            cannot be parsed as a number.
    """
    key = metric_key(metric, prefix)
    # Label values come from smartctl output (model names, serials, ...)
    # and may contain characters that would break the quoted label syntax,
    # so they are escaped before being embedded.
    labels = ','.join(
        '{k}="{v}"'.format(k=k, v=_escape_label_value(v))
        for k, v in metric.labels.items())
    # Decimal renders booleans as 1/0 and numeric strings unchanged.
    value = decimal.Decimal(metric.value)

    return '{key}{{{labels}}} {value}'.format(
        key=key, labels=labels, value=value)
103
104
def metric_print_meta(metric, prefix=''):
    """Print the ``# HELP`` and ``# TYPE`` header lines for a metric.

    Args:
        metric: (Metric) Metric whose name is described.
        prefix: (str) String prepended to the metric name.
    """
    key = metric_key(metric, prefix)
    help_line = '# HELP {key} SMART metric {metric.name}'.format(
        key=key, metric=metric)
    type_line = '# TYPE {key} gauge'.format(key=key)

    print(help_line)
    print(type_line)
110
111
def metric_print(metric, prefix=''):
    """Print the formatted sample line of ``metric`` to stdout."""
    line = metric_format(metric, prefix)
    print(line)
114
115
def smart_ctl(*args, check=True):
    """Run the smartctl binary with the given arguments.

    Args:
        *args: Command line arguments passed on to smartctl.
        check: (bool) Whether a non-zero exit status raises
            subprocess.CalledProcessError.

    Returns:
        (str) Standard output of the smartctl process, decoded as UTF-8.
    """
    command = ['smartctl']
    command.extend(args)

    completed = subprocess.run(command, stdout=subprocess.PIPE, check=check)
    return completed.stdout.decode('utf-8')
125
126
def smart_ctl_version():
    """Return the second whitespace-separated token of the first line
    printed by ``smartctl -V``."""
    first_line, _, _ = smart_ctl('-V').partition('\n')
    return first_line.split()[1]
129
130
def find_devices():
    """Enumerate the devices listed by ``smartctl --scan-open``.

    Each non-empty output line is tokenised shell-style with ``#`` comments
    removed; the first token is the device path and the remaining tokens
    are parsed for the ``-d``/``--device`` option.

    Yields:
        (Device) One device per usable output line.
    """
    option_parser = argparse.ArgumentParser()
    option_parser.add_argument('-d', '--device', dest='type')

    scan_output = smart_ctl('--scan-open')

    for raw_line in scan_output.split('\n'):
        line = raw_line.strip()
        # Lines that are blank or consist only of a comment carry no device.
        tokens = shlex.split(line, comments=True) if line else []
        if tokens:
            path, *options = tokens
            yield Device(path, option_parser.parse_args(options))
152
153
def device_is_active(device):
    """Tell whether the given device is currently active.

    The device is probed with ``smartctl --nocheck standby``; a failing
    smartctl invocation is taken to mean the device is not active.

    Args:
        device: (Device) Device in question.

    Returns:
        (bool) True if the device is active and False otherwise.
    """
    try:
        smart_ctl('--nocheck', 'standby', *device.smartctl_select())
    except subprocess.CalledProcessError:
        active = False
    else:
        active = True

    return active
169
170
def device_info(device):
    """Query basic model information of a device.

    Runs ``smartctl --info``, drops the first three output lines and keeps
    every remaining line that has the "key: value" shape.

    Args:
        device: (Device) Device in question.

    Returns:
        (generator): Generator yielding ``(key, value)`` tuples of strings.
    """
    output = smart_ctl('--info', *device.smartctl_select())
    body_lines = output.strip().split('\n')[3:]

    return (
        match.groups()
        for match in map(device_info_re.match, body_lines)
        if match is not None
    )
189
190
def device_smart_capabilities(device):
    """Report whether SMART is available and enabled on a device.

    Looks at the first word of every "SMART support" entry of the device
    information.

    Args:
        device: (Device) Device in question.

    Returns:
        (tuple): tuple containing:

            (bool): True whenever SMART is available, False otherwise.
            (bool): True whenever SMART is enabled, False otherwise.
    """
    states = set()
    for key, value in device_info(device):
        if key == 'SMART support':
            states.add(value.split(' ', 1)[0])

    return 'Available' in states, 'Enabled' in states
213
214
def collect_device_info(device):
    """Collect basic device information.

    Args:
        device: (Device) Device in question.

    Yields:
        (Metric) A single ``device_info`` metric with value True whose
        labels are the device's base labels plus every mapped information
        field that the device reported.
    """
    reported = dict(device_info(device))

    labels = dict(device.base_labels)
    for info_key, label_name in device_info_map.items():
        if info_key in reported:
            labels[label_name] = reported[info_key]

    yield Metric('device_info', labels, True)
229
230
def collect_device_health_self_assessment(device):
    """Collect metric about the device health self assessment.

    Args:
        device: (Device) Device in question.

    Yields:
        (Metric) ``device_smart_healthy``: True when a SMART status line
        of the ``smartctl --health`` output ends in PASSED or OK, False
        otherwise.
    """
    # smartctl signals a failing health status through a non-zero exit
    # code. That is exactly the case that has to be reported as a metric,
    # so the exit status must not raise here.
    out = smart_ctl(
        '--health', *device.smartctl_select(), check=False
    ).strip().split('\n')

    # Look for the status line by content rather than by a fixed line
    # number, so extra or missing lines in the output neither raise an
    # IndexError nor make the wrong line get inspected.
    self_assessment_passed = any(
        line.startswith('SMART') and line.rstrip().endswith(('PASSED', 'OK'))
        for line in out)

    yield Metric(
        'device_smart_healthy', device.base_labels, self_assessment_passed)
249
250
def collect_ata_metrics(device):
    """Collect the whitelisted SMART attributes of an ATA device.

    Args:
        device: (Device) Device in question.

    Yields:
        (Metric) ``attr_value``, ``attr_worst``, ``attr_threshold`` and
        ``attr_raw_value`` for every whitelisted attribute whose raw value
        starts with a number. The attribute name is carried in the
        ``name`` label.
    """
    # Fetch SMART attributes for the given device.
    attributes = smart_ctl(
        '--attributes', *device.smartctl_select()
    )

    # Replace multiple occurrences of whitespace with a single whitespace
    # so that the CSV parser recognizes individual columns properly.
    attributes = re.sub(r'[\t\x20]+', ' ', attributes)

    # Turn smartctl output into a list of lines and skip to the table of
    # SMART attributes.
    attribute_lines = attributes.strip().split('\n')[7:]

    raw_value_key = SmartAttribute._fields[-1]
    reader = csv.DictReader(
        (line.strip() for line in attribute_lines),
        fieldnames=SmartAttribute._fields[:-1],
        restkey=raw_value_key, delimiter=' ')
    for entry in reader:
        # Rows with too few columns leave the name unset (None); treat
        # them as "not whitelisted" instead of crashing on them.
        name = (entry.get('name') or '').lower()

        # We're only interested in the SMART attributes that are
        # whitelisted here.
        if name not in smart_attributes_whitelist:
            continue

        # Ensure that only the numeric parts are fetched from the
        # raw_value. Values such as "36 (Min/Max 24/40)" can't be
        # expressed properly as a prometheus metric. The raw value columns
        # are absent from the entry when the row ends before them.
        raw_columns = entry.get(raw_value_key) or []
        m = re.match(r'^(\d+)', ' '.join(raw_columns))
        if not m:
            continue
        entry[raw_value_key] = m.group(1)

        labels = {
            'name': name,
            **device.base_labels,
        }

        # raw_value is reported as well; it was parsed above for exactly
        # this purpose.
        for col in 'value', 'worst', 'threshold', 'raw_value':
            yield Metric('attr_{col}'.format(col=col), labels, entry[col])
295
296
def collect_ata_error_count(device):
    """Report the error number found in the device error log.

    Reads the log with ``smartctl -l xerror,1`` (exit status ignored) and
    takes the number from the first "Error <n> [...] occurred" line; when
    no such line exists the count is 0.

    Args:
        device: (Device) Device in question.

    Yields:
        (Metric) Device error count.
    """
    log_output = smart_ctl(
        '-l', 'xerror,1', *device.smartctl_select(), check=False)

    found = ata_error_count_re.search(log_output)
    if found is None:
        count = 0
    else:
        count = found.group(1)

    yield Metric('device_errors', device.base_labels, count)
314
315
def collect_disks_smart_metrics():
    """Collect all SMART metrics of every device found by smartctl.

    Yields:
        (Metric) Per device: the run timestamp and the active state, then,
        for active devices, the device information and the SMART
        available/enabled flags, and finally, when SMART is available and
        enabled, the health self assessment plus (for ``sat*`` device
        types) the ATA attributes and error count.
    """
    # Use an aware datetime: .timestamp() on the naive result of utcnow()
    # would interpret it as local time and shift the value by the host's
    # UTC offset.
    now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())

    for device in find_devices():
        yield Metric('smartctl_run', device.base_labels, now)

        is_active = device_is_active(device)

        yield Metric('device_active', device.base_labels, is_active)

        # Skip further metrics collection to prevent the disk from
        # spinning up.
        if not is_active:
            continue

        yield from collect_device_info(device)

        smart_available, smart_enabled = device_smart_capabilities(device)

        yield Metric(
            'device_smart_available', device.base_labels, smart_available)
        yield Metric(
            'device_smart_enabled', device.base_labels, smart_enabled)

        # Skip further metrics collection here if SMART is unavailable or
        # disabled on the device. Further smartctl invocations would fail
        # anyways.
        if not (smart_available and smart_enabled):
            continue

        yield from collect_device_health_self_assessment(device)

        if device.type.startswith('sat'):
            yield from collect_ata_metrics(device)

            yield from collect_ata_error_count(device)
352
353
def main():
    """Print the smartctl version metric followed by all disk metrics.

    Disk metrics are ordered by name so that the HELP/TYPE header of each
    metric is printed once, directly before its first sample.
    """
    prefix = 'smartmon_'

    version_metric = Metric(
        'smartctl_version', {'version': smart_ctl_version()}, True)
    metric_print_meta(version_metric, prefix)
    metric_print(version_metric, prefix)

    ordered = sorted(
        collect_disks_smart_metrics(), key=lambda metric: metric.name)

    last_name = None
    for metric in ordered:
        # A change of name marks the start of a new metric family.
        if metric.name != last_name:
            metric_print_meta(metric, prefix)
            last_name = metric.name

        metric_print(metric, prefix)
372
# Run the collector only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
375