aboutsummaryrefslogtreecommitdiff
path: root/text_collector_examples
diff options
context:
space:
mode:
author: Christopher Blum <zeichenanonym@web.de> 2018-11-07 17:12:23 +0100
committer: Ben Kochie <superq@gmail.com> 2018-11-07 17:12:23 +0100
commit: 1b98db9fa72abe93541fb1a7140388504601e303 (patch)
tree: b4d8fc9fbe1c324ff6e51d498adc2c76b1b3da10 /text_collector_examples
parent: 29d4629f55603001e25fea65e8cb593b86e58f47 (diff)
download: prometheus_node_collector-1b98db9fa72abe93541fb1a7140388504601e303.tar.bz2
prometheus_node_collector-1b98db9fa72abe93541fb1a7140388504601e303.tar.xz
prometheus_node_collector-1b98db9fa72abe93541fb1a7140388504601e303.zip
textfile example storcli enhancements (#1145)
* storcli.py: Remove IntEnum This removes an external dependency. Moved VD state to VD info labels * storcli.py: Fix BBU health detection BBU Status is 0 for a healthy cache vault and 32 for a healthy BBU. * storcli.py: Strip all strings from PD Strip all strings that we get from PDs. They often contain whitespace... * storcli.py: Add formatting options Add help text explaining how this document was formatted * storcli.py: Add DG to pd_info label Add disk group to pd_info. That way we can relate to PDs in the same DG. For example to check if all disks in one RAID use the same interface... * storcli.py: Fix promtool issues Fix linting issues reported by promtool check-metrics * storcli.py: Exit if storcli reports issues storcli reports if the command was a success. We should not continue if there are issues. * storcli.py: Try to parse metrics to float This will sanitize the values we hand over to node_exporter - eliminating any unforeseen values we read out... * storcli.py: Refactor code to implement handle_sas_controller() Move code into methods so that we can now also support HBA queries. * storcli.py: Sort imports "...like a good python developer" - Daniel Swarbrick * storcli.py: Replace external dateutil library with internal datetime Removes external dependency... * storcli.py: Also collect temperature on megaraid cards We have already collected them on mpt3sas cards... * storcli.py: Clean up old code Removed dead code that is not used any more. * storcli.py: strip() all information for labels They often contain whitespace... * storcli.py: Try to catch KeyErrors generally If some key we expect is not there, we will want to still print whatever we have collected so far... * storcli.py: Increment version number We have made some changes here and there. The general look of the data has not been changed. * storcli.py: Fix CodeSpell issue Split string to avoid issues with Codespell due to Celcius in JSON Key Signed-off-by: Christopher Blum <zeichenanonym@web.de>
Diffstat (limited to 'text_collector_examples')
-rwxr-xr-x text_collector_examples/storcli.py 214
1 files changed, 122 insertions, 92 deletions
diff --git a/text_collector_examples/storcli.py b/text_collector_examples/storcli.py
index 48e2bba..65fc00f 100755
--- a/text_collector_examples/storcli.py
+++ b/text_collector_examples/storcli.py
@@ -12,21 +12,23 @@ Advanced Software Options (ASO) not exposed as metrics currently.
12 12
13JSON key abbreviations used by StorCLI are documented in the standard command 13JSON key abbreviations used by StorCLI are documented in the standard command
14output, i.e. when you omit the trailing 'J' from the command. 14output, i.e. when you omit the trailing 'J' from the command.
15
16Formatting done with YAPF:
17$ yapf -i --style '{COLUMN_LIMIT: 99}' storcli.py
15""" 18"""
16 19
17from __future__ import print_function 20from __future__ import print_function
21from datetime import datetime
18import argparse 22import argparse
23import collections
19import json 24import json
20import os 25import os
21import subprocess
22import shlex 26import shlex
23from dateutil.parser import parse 27import subprocess
24import collections
25from enum import IntEnum
26 28
27DESCRIPTION = """Parses StorCLI's JSON output and exposes MegaRAID health as 29DESCRIPTION = """Parses StorCLI's JSON output and exposes MegaRAID health as
28 Prometheus metrics.""" 30 Prometheus metrics."""
29VERSION = '0.0.2' 31VERSION = '0.0.3'
30 32
31storcli_path = '' 33storcli_path = ''
32metric_prefix = 'megaraid_' 34metric_prefix = 'megaraid_'
@@ -34,59 +36,55 @@ metric_list = {}
34metric_list = collections.defaultdict(list) 36metric_list = collections.defaultdict(list)
35 37
36 38
37class VD_State(IntEnum):
38 Optl = 0 # Optimal
39 Dgrd = 1 # Degraded
40 Pdgd = 2 # Partially Degraded
41 OfLn = 3 # Offline
42 Rec = 4 # Recovery
43 Cac = 5 # CacheCade
44
45
46def main(args): 39def main(args):
47 """ main """ 40 """ main """
48 global storcli_path 41 global storcli_path
49 storcli_path = args.storcli_path 42 storcli_path = args.storcli_path
50 data = json.loads(get_storcli_json('/cALL show all J')) 43 data = get_storcli_json('/cALL show all J')
51 44
52 # All the information is collected underneath the Controllers key 45 try:
53 data = data['Controllers'] 46 # All the information is collected underneath the Controllers key
54 47 data = data['Controllers']
55 # try: 48
56 # overview = status['Response Data']['System Overview'] 49 for controller in data:
57 # except KeyError: 50 response = controller['Response Data']
58 # pass 51 if response['Version']['Driver Name'] == 'megaraid_sas':
59 52 handle_megaraid_controller(response)
60 for controller in data: 53 elif response['Version']['Driver Name'] == 'mpt3sas':
61 response = controller['Response Data'] 54 handle_sas_controller(response)
62 if response['Version']['Driver Name'] == 'megaraid_sas': 55 except KeyError:
63 handle_megaraid_controller(response) 56 pass
64 elif response['Version']['Driver Name'] == 'mpt3sas': 57
65 handle_sas_controller(response)
66
67 # print_dict_to_exporter({'controller_info': [1]}, controller_info_list)
68 # print_dict_to_exporter({'virtual_disk_info': [1]}, vd_info_list)
69 # print_dict_to_exporter({'physical_disk_info': [1]}, pd_info_list)
70 # print_all_metrics(vd_metric_list)
71 print_all_metrics(metric_list) 58 print_all_metrics(metric_list)
72 59
73 60
74def handle_sas_controller(response): 61def handle_sas_controller(response):
75 pass 62 (controller_index, baselabel) = get_basic_controller_info(response)
63 add_metric('healthy', baselabel, int(response['Status']['Controller Status'] == 'OK'))
64 add_metric('ports', baselabel, response['HwCfg']['Backend Port Count'])
65 try:
66 # The number of physical disks is half of the number of items in this dict
67 # Every disk is listed twice - once for basic info, again for detailed info
68 add_metric('physical_drives', baselabel,
69 len(response['Physical Device Information'].keys()) / 2)
70 except AttributeError:
71 pass
72 # Split up string to not trigger CodeSpell issues
73 add_metric('temperature', baselabel,
74 int(response['HwCfg']['ROC temperature(Degree Celc' + 'ius)']))
75 for key, basic_disk_info in response['Physical Device Information'].items():
76 if 'Detailed Information' in key:
77 continue
78 create_metrcis_of_physical_drive(basic_disk_info[0],
79 response['Physical Device Information'], controller_index)
76 80
77 81
78def handle_megaraid_controller(response): 82def handle_megaraid_controller(response):
79 controller_index = response['Basics']['Controller'] 83 (controller_index, baselabel) = get_basic_controller_info(response)
80 baselabel = 'controller="{}"'.format(controller_index)
81 84
82 controller_info_label = baselabel + ',model="{}",serial="{}",fwversion="{}"'.format( 85 # BBU Status Optimal value is 0 for cachevault and 32 for BBU
83 response['Basics']['Model'], 86 add_metric('battery_backup_healthy', baselabel,
84 response['Basics']['Serial Number'], 87 int(response['Status']['BBU Status'] in [0, 32]))
85 response['Version']['Firmware Version'],
86 )
87 add_metric('controller_info', controller_info_label, 1)
88
89 add_metric('battery_backup_healthy', baselabel, int(response['Status']['BBU Status'] == 0))
90 add_metric('degraded', baselabel, int(response['Status']['Controller Status'] == 'Degraded')) 88 add_metric('degraded', baselabel, int(response['Status']['Controller Status'] == 'Degraded'))
91 add_metric('failed', baselabel, int(response['Status']['Controller Status'] == 'Failed')) 89 add_metric('failed', baselabel, int(response['Status']['Controller Status'] == 'Failed'))
92 add_metric('healthy', baselabel, int(response['Status']['Controller Status'] == 'Optimal')) 90 add_metric('healthy', baselabel, int(response['Status']['Controller Status'] == 'Optimal'))
@@ -96,10 +94,13 @@ def handle_megaraid_controller(response):
96 add_metric('ports', baselabel, response['HwCfg']['Backend Port Count']) 94 add_metric('ports', baselabel, response['HwCfg']['Backend Port Count'])
97 add_metric('scheduled_patrol_read', baselabel, 95 add_metric('scheduled_patrol_read', baselabel,
98 int('hrs' in response['Scheduled Tasks']['Patrol Read Reoccurrence'])) 96 int('hrs' in response['Scheduled Tasks']['Patrol Read Reoccurrence']))
97 add_metric('temperature', baselabel, int(response['HwCfg']['ROC temperature(Degree Celsius)']))
99 98
100 time_difference_seconds = -1 99 time_difference_seconds = -1
101 system_time = parse(response['Basics'].get('Current System Date/time')) 100 system_time = datetime.strptime(response['Basics'].get('Current System Date/time'),
102 controller_time = parse(response['Basics'].get('Current Controller Date/Time')) 101 "%m/%d/%Y, %H:%M:%S")
102 controller_time = datetime.strptime(response['Basics'].get('Current Controller Date/Time'),
103 "%m/%d/%Y, %H:%M:%S")
103 if system_time and controller_time: 104 if system_time and controller_time:
104 time_difference_seconds = abs(system_time - controller_time).seconds 105 time_difference_seconds = abs(system_time - controller_time).seconds
105 add_metric('time_difference', baselabel, time_difference_seconds) 106 add_metric('time_difference', baselabel, time_difference_seconds)
@@ -112,58 +113,84 @@ def handle_megaraid_controller(response):
112 volume_group = vd_position.split('/')[1] 113 volume_group = vd_position.split('/')[1]
113 vd_baselabel = 'controller="{}",DG="{}",VG="{}"'.format(controller_index, drive_group, 114 vd_baselabel = 'controller="{}",DG="{}",VG="{}"'.format(controller_index, drive_group,
114 volume_group) 115 volume_group)
115 vd_info_label = vd_baselabel + ',name="{}",cache="{}",type="{}"'.format( 116 vd_info_label = vd_baselabel + ',name="{}",cache="{}",type="{}",state="{}"'.format(
116 virtual_drive.get('Name'), virtual_drive.get('Cache'), virtual_drive.get('TYPE')) 117 str(virtual_drive.get('Name')).strip(),
118 str(virtual_drive.get('Cache')).strip(),
119 str(virtual_drive.get('TYPE')).strip(),
120 str(virtual_drive.get('State')).strip())
117 add_metric('vd_info', vd_info_label, 1) 121 add_metric('vd_info', vd_info_label, 1)
118 add_metric('vd_status', vd_baselabel, int(VD_State[virtual_drive.get('State')]))
119 122
120 if response['Physical Drives'] > 0: 123 if response['Physical Drives'] > 0:
121 data = json.loads(get_storcli_json('/cALL/eALL/sALL show all J')) 124 data = get_storcli_json('/cALL/eALL/sALL show all J')
122 drive_info = data['Controllers'][controller_index]['Response Data'] 125 drive_info = data['Controllers'][controller_index]['Response Data']
123 for physical_drive in response['PD LIST']: 126 for physical_drive in response['PD LIST']:
124 enclosure = physical_drive.get('EID:Slt').split(':')[0] 127 create_metrcis_of_physical_drive(physical_drive, drive_info, controller_index)
125 slot = physical_drive.get('EID:Slt').split(':')[1] 128
126 129
127 pd_baselabel = 'controller="{}",enclosure="{}",slot="{}"'.format( 130def get_basic_controller_info(response):
128 controller_index, enclosure, slot) 131 controller_index = response['Basics']['Controller']
129 pd_info_label = pd_baselabel + ',disk_id="{}",interface="{}",media="{}",model="{}"'.format( 132 baselabel = 'controller="{}"'.format(controller_index)
130 physical_drive.get('DID'), physical_drive.get('Intf'), physical_drive.get('Med'), 133
131 physical_drive.get('Model').strip()) 134 controller_info_label = baselabel + ',model="{}",serial="{}",fwversion="{}"'.format(
132 135 str(response['Basics']['Model']).strip(),
133 drive_identifier = 'Drive /c' + str(controller_index) + '/e' + str(enclosure) + '/s' + str( 136 str(response['Basics']['Serial Number']).strip(),
134 slot) 137 str(response['Version']['Firmware Version']).strip(),
135 try: 138 )
136 info = drive_info[drive_identifier + ' - Detailed Information'] 139 add_metric('controller_info', controller_info_label, 1)
137 state = info[drive_identifier + ' State'] 140
138 attributes = info[drive_identifier + ' Device attributes'] 141 return (controller_index, baselabel)
139 settings = info[drive_identifier + ' Policies/Settings'] 142
140 143
141 add_metric('pd_shield_counter', pd_baselabel, state['Shield Counter']) 144def create_metrcis_of_physical_drive(physical_drive, detailed_info_array, controller_index):
142 add_metric('pd_media_errors_total', pd_baselabel, state['Media Error Count']) 145 enclosure = physical_drive.get('EID:Slt').split(':')[0]
143 add_metric('pd_other_errors_total', pd_baselabel, state['Other Error Count']) 146 slot = physical_drive.get('EID:Slt').split(':')[1]
144 add_metric('pd_predictive_errors_total', pd_baselabel, 147
145 state['Predictive Failure Count']) 148 pd_baselabel = 'controller="{}",enclosure="{}",slot="{}"'.format(controller_index, enclosure,
146 add_metric('pd_smart_alerted', pd_baselabel, 149 slot)
147 int(state['S.M.A.R.T alert flagged by drive'] == 'Yes')) 150 pd_info_label = pd_baselabel + \
148 add_metric('pd_link_speed_gbps', pd_baselabel, attributes['Link Speed'].split('.')[0]) 151 ',disk_id="{}",interface="{}",media="{}",model="{}",DG="{}"'.format(
149 add_metric('pd_device_speed_gbps', pd_baselabel, 152 str(physical_drive.get('DID')).strip(),
150 attributes['Device Speed'].split('.')[0]) 153 str(physical_drive.get('Intf')).strip(),
151 add_metric('pd_commissioned_spare', pd_baselabel, 154 str(physical_drive.get('Med')).strip(),
152 int(settings['Commissioned Spare'] == 'Yes')) 155 str(physical_drive.get('Model')).strip(),
153 add_metric('pd_emergency_spare', pd_baselabel, 156 str(physical_drive.get('DG')).strip())
154 int(settings['Emergency Spare'] == 'Yes')) 157
155 pd_info_label += ',firmware="{}"'.format(attributes['Firmware Revision']) 158 drive_identifier = 'Drive /c' + str(controller_index) + '/e' + str(enclosure) + '/s' + str(
156 except KeyError: 159 slot)
157 pass 160 if enclosure == ' ':
158 add_metric('pd_info', pd_info_label, 1) 161 drive_identifier = 'Drive /c' + str(controller_index) + '/s' + str(slot)
162 try:
163 info = detailed_info_array[drive_identifier + ' - Detailed Information']
164 state = info[drive_identifier + ' State']
165 attributes = info[drive_identifier + ' Device attributes']
166 settings = info[drive_identifier + ' Policies/Settings']
167
168 add_metric('pd_shield_counter', pd_baselabel, state['Shield Counter'])
169 add_metric('pd_media_errors', pd_baselabel, state['Media Error Count'])
170 add_metric('pd_other_errors', pd_baselabel, state['Other Error Count'])
171 add_metric('pd_predictive_errors', pd_baselabel, state['Predictive Failure Count'])
172 add_metric('pd_smart_alerted', pd_baselabel,
173 int(state['S.M.A.R.T alert flagged by drive'] == 'Yes'))
174 add_metric('pd_link_speed_gbps', pd_baselabel, attributes['Link Speed'].split('.')[0])
175 add_metric('pd_device_speed_gbps', pd_baselabel, attributes['Device Speed'].split('.')[0])
176 add_metric('pd_commissioned_spare', pd_baselabel,
177 int(settings['Commissioned Spare'] == 'Yes'))
178 add_metric('pd_emergency_spare', pd_baselabel, int(settings['Emergency Spare'] == 'Yes'))
179 pd_info_label += ',firmware="{}"'.format(attributes['Firmware Revision'].strip())
180 except KeyError:
181 pass
182 add_metric('pd_info', pd_info_label, 1)
159 183
160 184
161def add_metric(name, labels, value): 185def add_metric(name, labels, value):
162 global metric_list 186 global metric_list
163 metric_list[name].append({ 187 try:
164 'labels': labels, 188 metric_list[name].append({
165 'value': value, 189 'labels': labels,
166 }) 190 'value': float(value),
191 })
192 except ValueError:
193 pass
167 194
168 195
169def print_all_metrics(metrics): 196def print_all_metrics(metrics):
@@ -184,8 +211,11 @@ def get_storcli_json(storcli_args):
184 proc = subprocess.Popen( 211 proc = subprocess.Popen(
185 storcli_cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 212 storcli_cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
186 output_json = proc.communicate()[0] 213 output_json = proc.communicate()[0]
214 data = json.loads(output_json.decode("utf-8"))
187 215
188 return output_json.decode("utf-8") 216 if data["Controllers"][0]["Command Status"]["Status"] != "Success":
217 SystemExit(1)
218 return data
189 219
190 220
191if __name__ == "__main__": 221if __name__ == "__main__":