diff options
author | Christopher Blum <zeichenanonym@web.de> | 2018-11-07 17:12:23 +0100 |
---|---|---|
committer | Ben Kochie <superq@gmail.com> | 2018-11-07 17:12:23 +0100 |
commit | 1b98db9fa72abe93541fb1a7140388504601e303 (patch) | |
tree | b4d8fc9fbe1c324ff6e51d498adc2c76b1b3da10 /text_collector_examples | |
parent | 29d4629f55603001e25fea65e8cb593b86e58f47 (diff) | |
download | prometheus_node_collector-1b98db9fa72abe93541fb1a7140388504601e303.tar.bz2 prometheus_node_collector-1b98db9fa72abe93541fb1a7140388504601e303.tar.xz prometheus_node_collector-1b98db9fa72abe93541fb1a7140388504601e303.zip |
textfile example storcli enhancements (#1145)
* storcli.py: Remove IntEnum
This removes an external dependency.
Moved VD state to VD info labels
* storcli.py: Fix BBU health detection
BBU Status is 0 for a healthy cache vault and 32 for a healthy BBU.
* storcli.py: Strip all strings from PD
Strip all strings that we get from PDs.
They often contain whitespace....
* storcli.py: Add formatting options
Add help text explaining how this document was formatted
* storcli.py: Add DG to pd_info label
Add disk group to pd_info.
That way we can relate to PDs in the same DG.
For example to check if all disks in one RAID
use the same interface...
* storcli.py: Fix promtool issues
Fix linting issues reported by promtool check-metrics
* storcli.py: Exit if storcli reports issues
storcli reports if the command was a success.
We should not continue if there are issues.
* storcli.py: Try to parse metrics to float
This will sanitize the values we hand over to
node_exporter - eliminating any unforeseen values we read out...
* storcli.py: Refactor code to implement handle_sas_controller()
Move code into methods so that we can now also support HBA queries.
* storcli.py: Sort inputs
"...like a good python developer"
- Daniel Swarbrick
* storcli.py: Replace external dateutil library with internal datetime
Removes external dependency...
* storcli.py: Also collect temperature on megaraid cards
We have already collected them on mpt3sas cards...
* storcli.py: Clean up old code
Removed dead code that is not used any more.
* storcli.py: strip() all information for labels
They often contain whitespace...
* storcli.py: Try to catch KeyErrors generally
If some key we expect is not there, we will want to
still print whatever we have collected so far...
* storcli.py: Increment version number
We have made some changes here and there.
The general look of the data has not been changed.
* storcli.py: Fix CodeSpell issue
Split string to avoid issues with Codespell due to Celcius in JSON Key
Signed-off-by: Christopher Blum <zeichenanonym@web.de>
Diffstat (limited to 'text_collector_examples')
-rwxr-xr-x | text_collector_examples/storcli.py | 214 |
1 files changed, 122 insertions, 92 deletions
diff --git a/text_collector_examples/storcli.py b/text_collector_examples/storcli.py index 48e2bba..65fc00f 100755 --- a/text_collector_examples/storcli.py +++ b/text_collector_examples/storcli.py | |||
@@ -12,21 +12,23 @@ Advanced Software Options (ASO) not exposed as metrics currently. | |||
12 | 12 | ||
13 | JSON key abbreviations used by StorCLI are documented in the standard command | 13 | JSON key abbreviations used by StorCLI are documented in the standard command |
14 | output, i.e. when you omit the trailing 'J' from the command. | 14 | output, i.e. when you omit the trailing 'J' from the command. |
15 | |||
16 | Formatting done with YAPF: | ||
17 | $ yapf -i --style '{COLUMN_LIMIT: 99}' storcli.py | ||
15 | """ | 18 | """ |
16 | 19 | ||
17 | from __future__ import print_function | 20 | from __future__ import print_function |
21 | from datetime import datetime | ||
18 | import argparse | 22 | import argparse |
23 | import collections | ||
19 | import json | 24 | import json |
20 | import os | 25 | import os |
21 | import subprocess | ||
22 | import shlex | 26 | import shlex |
23 | from dateutil.parser import parse | 27 | import subprocess |
24 | import collections | ||
25 | from enum import IntEnum | ||
26 | 28 | ||
27 | DESCRIPTION = """Parses StorCLI's JSON output and exposes MegaRAID health as | 29 | DESCRIPTION = """Parses StorCLI's JSON output and exposes MegaRAID health as |
28 | Prometheus metrics.""" | 30 | Prometheus metrics.""" |
29 | VERSION = '0.0.2' | 31 | VERSION = '0.0.3' |
30 | 32 | ||
31 | storcli_path = '' | 33 | storcli_path = '' |
32 | metric_prefix = 'megaraid_' | 34 | metric_prefix = 'megaraid_' |
@@ -34,59 +36,55 @@ metric_list = {} | |||
34 | metric_list = collections.defaultdict(list) | 36 | metric_list = collections.defaultdict(list) |
35 | 37 | ||
36 | 38 | ||
37 | class VD_State(IntEnum): | ||
38 | Optl = 0 # Optimal | ||
39 | Dgrd = 1 # Degraded | ||
40 | Pdgd = 2 # Partially Degraded | ||
41 | OfLn = 3 # Offline | ||
42 | Rec = 4 # Recovery | ||
43 | Cac = 5 # CacheCade | ||
44 | |||
45 | |||
46 | def main(args): | 39 | def main(args): |
47 | """ main """ | 40 | """ main """ |
48 | global storcli_path | 41 | global storcli_path |
49 | storcli_path = args.storcli_path | 42 | storcli_path = args.storcli_path |
50 | data = json.loads(get_storcli_json('/cALL show all J')) | 43 | data = get_storcli_json('/cALL show all J') |
51 | 44 | ||
52 | # All the information is collected underneath the Controllers key | 45 | try: |
53 | data = data['Controllers'] | 46 | # All the information is collected underneath the Controllers key |
54 | 47 | data = data['Controllers'] | |
55 | # try: | 48 | |
56 | # overview = status['Response Data']['System Overview'] | 49 | for controller in data: |
57 | # except KeyError: | 50 | response = controller['Response Data'] |
58 | # pass | 51 | if response['Version']['Driver Name'] == 'megaraid_sas': |
59 | 52 | handle_megaraid_controller(response) | |
60 | for controller in data: | 53 | elif response['Version']['Driver Name'] == 'mpt3sas': |
61 | response = controller['Response Data'] | 54 | handle_sas_controller(response) |
62 | if response['Version']['Driver Name'] == 'megaraid_sas': | 55 | except KeyError: |
63 | handle_megaraid_controller(response) | 56 | pass |
64 | elif response['Version']['Driver Name'] == 'mpt3sas': | 57 | |
65 | handle_sas_controller(response) | ||
66 | |||
67 | # print_dict_to_exporter({'controller_info': [1]}, controller_info_list) | ||
68 | # print_dict_to_exporter({'virtual_disk_info': [1]}, vd_info_list) | ||
69 | # print_dict_to_exporter({'physical_disk_info': [1]}, pd_info_list) | ||
70 | # print_all_metrics(vd_metric_list) | ||
71 | print_all_metrics(metric_list) | 58 | print_all_metrics(metric_list) |
72 | 59 | ||
73 | 60 | ||
74 | def handle_sas_controller(response): | 61 | def handle_sas_controller(response): |
75 | pass | 62 | (controller_index, baselabel) = get_basic_controller_info(response) |
63 | add_metric('healthy', baselabel, int(response['Status']['Controller Status'] == 'OK')) | ||
64 | add_metric('ports', baselabel, response['HwCfg']['Backend Port Count']) | ||
65 | try: | ||
66 | # The number of physical disks is half of the number of items in this dict | ||
67 | # Every disk is listed twice - once for basic info, again for detailed info | ||
68 | add_metric('physical_drives', baselabel, | ||
69 | len(response['Physical Device Information'].keys()) / 2) | ||
70 | except AttributeError: | ||
71 | pass | ||
72 | # Split up string to not trigger CodeSpell issues | ||
73 | add_metric('temperature', baselabel, | ||
74 | int(response['HwCfg']['ROC temperature(Degree Celc' + 'ius)'])) | ||
75 | for key, basic_disk_info in response['Physical Device Information'].items(): | ||
76 | if 'Detailed Information' in key: | ||
77 | continue | ||
78 | create_metrcis_of_physical_drive(basic_disk_info[0], | ||
79 | response['Physical Device Information'], controller_index) | ||
76 | 80 | ||
77 | 81 | ||
78 | def handle_megaraid_controller(response): | 82 | def handle_megaraid_controller(response): |
79 | controller_index = response['Basics']['Controller'] | 83 | (controller_index, baselabel) = get_basic_controller_info(response) |
80 | baselabel = 'controller="{}"'.format(controller_index) | ||
81 | 84 | ||
82 | controller_info_label = baselabel + ',model="{}",serial="{}",fwversion="{}"'.format( | 85 | # BBU Status Optimal value is 0 for cachevault and 32 for BBU |
83 | response['Basics']['Model'], | 86 | add_metric('battery_backup_healthy', baselabel, |
84 | response['Basics']['Serial Number'], | 87 | int(response['Status']['BBU Status'] in [0, 32])) |
85 | response['Version']['Firmware Version'], | ||
86 | ) | ||
87 | add_metric('controller_info', controller_info_label, 1) | ||
88 | |||
89 | add_metric('battery_backup_healthy', baselabel, int(response['Status']['BBU Status'] == 0)) | ||
90 | add_metric('degraded', baselabel, int(response['Status']['Controller Status'] == 'Degraded')) | 88 | add_metric('degraded', baselabel, int(response['Status']['Controller Status'] == 'Degraded')) |
91 | add_metric('failed', baselabel, int(response['Status']['Controller Status'] == 'Failed')) | 89 | add_metric('failed', baselabel, int(response['Status']['Controller Status'] == 'Failed')) |
92 | add_metric('healthy', baselabel, int(response['Status']['Controller Status'] == 'Optimal')) | 90 | add_metric('healthy', baselabel, int(response['Status']['Controller Status'] == 'Optimal')) |
@@ -96,10 +94,13 @@ def handle_megaraid_controller(response): | |||
96 | add_metric('ports', baselabel, response['HwCfg']['Backend Port Count']) | 94 | add_metric('ports', baselabel, response['HwCfg']['Backend Port Count']) |
97 | add_metric('scheduled_patrol_read', baselabel, | 95 | add_metric('scheduled_patrol_read', baselabel, |
98 | int('hrs' in response['Scheduled Tasks']['Patrol Read Reoccurrence'])) | 96 | int('hrs' in response['Scheduled Tasks']['Patrol Read Reoccurrence'])) |
97 | add_metric('temperature', baselabel, int(response['HwCfg']['ROC temperature(Degree Celsius)'])) | ||
99 | 98 | ||
100 | time_difference_seconds = -1 | 99 | time_difference_seconds = -1 |
101 | system_time = parse(response['Basics'].get('Current System Date/time')) | 100 | system_time = datetime.strptime(response['Basics'].get('Current System Date/time'), |
102 | controller_time = parse(response['Basics'].get('Current Controller Date/Time')) | 101 | "%m/%d/%Y, %H:%M:%S") |
102 | controller_time = datetime.strptime(response['Basics'].get('Current Controller Date/Time'), | ||
103 | "%m/%d/%Y, %H:%M:%S") | ||
103 | if system_time and controller_time: | 104 | if system_time and controller_time: |
104 | time_difference_seconds = abs(system_time - controller_time).seconds | 105 | time_difference_seconds = abs(system_time - controller_time).seconds |
105 | add_metric('time_difference', baselabel, time_difference_seconds) | 106 | add_metric('time_difference', baselabel, time_difference_seconds) |
@@ -112,58 +113,84 @@ def handle_megaraid_controller(response): | |||
112 | volume_group = vd_position.split('/')[1] | 113 | volume_group = vd_position.split('/')[1] |
113 | vd_baselabel = 'controller="{}",DG="{}",VG="{}"'.format(controller_index, drive_group, | 114 | vd_baselabel = 'controller="{}",DG="{}",VG="{}"'.format(controller_index, drive_group, |
114 | volume_group) | 115 | volume_group) |
115 | vd_info_label = vd_baselabel + ',name="{}",cache="{}",type="{}"'.format( | 116 | vd_info_label = vd_baselabel + ',name="{}",cache="{}",type="{}",state="{}"'.format( |
116 | virtual_drive.get('Name'), virtual_drive.get('Cache'), virtual_drive.get('TYPE')) | 117 | str(virtual_drive.get('Name')).strip(), |
118 | str(virtual_drive.get('Cache')).strip(), | ||
119 | str(virtual_drive.get('TYPE')).strip(), | ||
120 | str(virtual_drive.get('State')).strip()) | ||
117 | add_metric('vd_info', vd_info_label, 1) | 121 | add_metric('vd_info', vd_info_label, 1) |
118 | add_metric('vd_status', vd_baselabel, int(VD_State[virtual_drive.get('State')])) | ||
119 | 122 | ||
120 | if response['Physical Drives'] > 0: | 123 | if response['Physical Drives'] > 0: |
121 | data = json.loads(get_storcli_json('/cALL/eALL/sALL show all J')) | 124 | data = get_storcli_json('/cALL/eALL/sALL show all J') |
122 | drive_info = data['Controllers'][controller_index]['Response Data'] | 125 | drive_info = data['Controllers'][controller_index]['Response Data'] |
123 | for physical_drive in response['PD LIST']: | 126 | for physical_drive in response['PD LIST']: |
124 | enclosure = physical_drive.get('EID:Slt').split(':')[0] | 127 | create_metrcis_of_physical_drive(physical_drive, drive_info, controller_index) |
125 | slot = physical_drive.get('EID:Slt').split(':')[1] | 128 | |
126 | 129 | ||
127 | pd_baselabel = 'controller="{}",enclosure="{}",slot="{}"'.format( | 130 | def get_basic_controller_info(response): |
128 | controller_index, enclosure, slot) | 131 | controller_index = response['Basics']['Controller'] |
129 | pd_info_label = pd_baselabel + ',disk_id="{}",interface="{}",media="{}",model="{}"'.format( | 132 | baselabel = 'controller="{}"'.format(controller_index) |
130 | physical_drive.get('DID'), physical_drive.get('Intf'), physical_drive.get('Med'), | 133 | |
131 | physical_drive.get('Model').strip()) | 134 | controller_info_label = baselabel + ',model="{}",serial="{}",fwversion="{}"'.format( |
132 | 135 | str(response['Basics']['Model']).strip(), | |
133 | drive_identifier = 'Drive /c' + str(controller_index) + '/e' + str(enclosure) + '/s' + str( | 136 | str(response['Basics']['Serial Number']).strip(), |
134 | slot) | 137 | str(response['Version']['Firmware Version']).strip(), |
135 | try: | 138 | ) |
136 | info = drive_info[drive_identifier + ' - Detailed Information'] | 139 | add_metric('controller_info', controller_info_label, 1) |
137 | state = info[drive_identifier + ' State'] | 140 | |
138 | attributes = info[drive_identifier + ' Device attributes'] | 141 | return (controller_index, baselabel) |
139 | settings = info[drive_identifier + ' Policies/Settings'] | 142 | |
140 | 143 | ||
141 | add_metric('pd_shield_counter', pd_baselabel, state['Shield Counter']) | 144 | def create_metrcis_of_physical_drive(physical_drive, detailed_info_array, controller_index): |
142 | add_metric('pd_media_errors_total', pd_baselabel, state['Media Error Count']) | 145 | enclosure = physical_drive.get('EID:Slt').split(':')[0] |
143 | add_metric('pd_other_errors_total', pd_baselabel, state['Other Error Count']) | 146 | slot = physical_drive.get('EID:Slt').split(':')[1] |
144 | add_metric('pd_predictive_errors_total', pd_baselabel, | 147 | |
145 | state['Predictive Failure Count']) | 148 | pd_baselabel = 'controller="{}",enclosure="{}",slot="{}"'.format(controller_index, enclosure, |
146 | add_metric('pd_smart_alerted', pd_baselabel, | 149 | slot) |
147 | int(state['S.M.A.R.T alert flagged by drive'] == 'Yes')) | 150 | pd_info_label = pd_baselabel + \ |
148 | add_metric('pd_link_speed_gbps', pd_baselabel, attributes['Link Speed'].split('.')[0]) | 151 | ',disk_id="{}",interface="{}",media="{}",model="{}",DG="{}"'.format( |
149 | add_metric('pd_device_speed_gbps', pd_baselabel, | 152 | str(physical_drive.get('DID')).strip(), |
150 | attributes['Device Speed'].split('.')[0]) | 153 | str(physical_drive.get('Intf')).strip(), |
151 | add_metric('pd_commissioned_spare', pd_baselabel, | 154 | str(physical_drive.get('Med')).strip(), |
152 | int(settings['Commissioned Spare'] == 'Yes')) | 155 | str(physical_drive.get('Model')).strip(), |
153 | add_metric('pd_emergency_spare', pd_baselabel, | 156 | str(physical_drive.get('DG')).strip()) |
154 | int(settings['Emergency Spare'] == 'Yes')) | 157 | |
155 | pd_info_label += ',firmware="{}"'.format(attributes['Firmware Revision']) | 158 | drive_identifier = 'Drive /c' + str(controller_index) + '/e' + str(enclosure) + '/s' + str( |
156 | except KeyError: | 159 | slot) |
157 | pass | 160 | if enclosure == ' ': |
158 | add_metric('pd_info', pd_info_label, 1) | 161 | drive_identifier = 'Drive /c' + str(controller_index) + '/s' + str(slot) |
162 | try: | ||
163 | info = detailed_info_array[drive_identifier + ' - Detailed Information'] | ||
164 | state = info[drive_identifier + ' State'] | ||
165 | attributes = info[drive_identifier + ' Device attributes'] | ||
166 | settings = info[drive_identifier + ' Policies/Settings'] | ||
167 | |||
168 | add_metric('pd_shield_counter', pd_baselabel, state['Shield Counter']) | ||
169 | add_metric('pd_media_errors', pd_baselabel, state['Media Error Count']) | ||
170 | add_metric('pd_other_errors', pd_baselabel, state['Other Error Count']) | ||
171 | add_metric('pd_predictive_errors', pd_baselabel, state['Predictive Failure Count']) | ||
172 | add_metric('pd_smart_alerted', pd_baselabel, | ||
173 | int(state['S.M.A.R.T alert flagged by drive'] == 'Yes')) | ||
174 | add_metric('pd_link_speed_gbps', pd_baselabel, attributes['Link Speed'].split('.')[0]) | ||
175 | add_metric('pd_device_speed_gbps', pd_baselabel, attributes['Device Speed'].split('.')[0]) | ||
176 | add_metric('pd_commissioned_spare', pd_baselabel, | ||
177 | int(settings['Commissioned Spare'] == 'Yes')) | ||
178 | add_metric('pd_emergency_spare', pd_baselabel, int(settings['Emergency Spare'] == 'Yes')) | ||
179 | pd_info_label += ',firmware="{}"'.format(attributes['Firmware Revision'].strip()) | ||
180 | except KeyError: | ||
181 | pass | ||
182 | add_metric('pd_info', pd_info_label, 1) | ||
159 | 183 | ||
160 | 184 | ||
161 | def add_metric(name, labels, value): | 185 | def add_metric(name, labels, value): |
162 | global metric_list | 186 | global metric_list |
163 | metric_list[name].append({ | 187 | try: |
164 | 'labels': labels, | 188 | metric_list[name].append({ |
165 | 'value': value, | 189 | 'labels': labels, |
166 | }) | 190 | 'value': float(value), |
191 | }) | ||
192 | except ValueError: | ||
193 | pass | ||
167 | 194 | ||
168 | 195 | ||
169 | def print_all_metrics(metrics): | 196 | def print_all_metrics(metrics): |
@@ -184,8 +211,11 @@ def get_storcli_json(storcli_args): | |||
184 | proc = subprocess.Popen( | 211 | proc = subprocess.Popen( |
185 | storcli_cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) | 212 | storcli_cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) |
186 | output_json = proc.communicate()[0] | 213 | output_json = proc.communicate()[0] |
214 | data = json.loads(output_json.decode("utf-8")) | ||
187 | 215 | ||
188 | return output_json.decode("utf-8") | 216 | if data["Controllers"][0]["Command Status"]["Status"] != "Success": |
217 | SystemExit(1) | ||
218 | return data | ||
189 | 219 | ||
190 | 220 | ||
191 | if __name__ == "__main__": | 221 | if __name__ == "__main__": |