From 6aa5cfba6c42ea6a9bd83787b2e595984d91f8d6 Mon Sep 17 00:00:00 2001
From: Christopher Blum <zeichenanonym@web.de>
Date: Tue, 18 Sep 2018 22:43:20 +0200
Subject: textfile example script rework (#1074)

* textfile smartmon.sh

Added functions to also parse megaraid disks.
Added parsing to also detect the grown_defects counters.

* textfile storcli.py

Reworked the example file to export lots more information about
megaraid attached controllers, VDs and PDs.

Signed-off-by: Christopher Blum <christopher.blum@profitbricks.com>
---
 text_collector_examples/smartmon.sh | 111 +++++++++-------
 text_collector_examples/storcli.py  | 259 +++++++++++++++++++++++-------------
 2 files changed, 227 insertions(+), 143 deletions(-)

(limited to 'text_collector_examples')

diff --git a/text_collector_examples/smartmon.sh b/text_collector_examples/smartmon.sh
index 9b0c7d4..7b873fa 100755
--- a/text_collector_examples/smartmon.sh
+++ b/text_collector_examples/smartmon.sh
@@ -7,7 +7,11 @@
 #       data in them than you'd think.
 #       http://arstechnica.com/civis/viewtopic.php?p=22062211
 
-parse_smartctl_attributes_awk="$(cat << 'SMARTCTLAWK'
+# Formatting done via shfmt -i 2
+# https://github.com/mvdan/sh
+
+parse_smartctl_attributes_awk="$(
+  cat <<'SMARTCTLAWK'
 $1 ~ /^ *[0-9]+$/ && $2 ~ /^[a-zA-Z0-9_-]+$/ {
   gsub(/-/, "_");
   printf "%s_value{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $4
@@ -18,7 +22,8 @@ $1 ~ /^ *[0-9]+$/ && $2 ~ /^[a-zA-Z0-9_-]+$/ {
 SMARTCTLAWK
 )"
 
-smartmon_attrs="$(cat << 'SMARTMONATTRS'
+smartmon_attrs="$(
+  cat <<'SMARTMONATTRS'
 airflow_temperature_cel
 command_timeout
 current_pending_sector
@@ -64,63 +69,65 @@ parse_smartctl_attributes() {
   local disk_type="$2"
   local labels="disk=\"${disk}\",type=\"${disk_type}\""
   local vars="$(echo "${smartmon_attrs}" | xargs | tr ' ' '|')"
-  sed 's/^ \+//g' \
-    | awk -v labels="${labels}" "${parse_smartctl_attributes_awk}" 2>/dev/null \
-    | tr A-Z a-z \
-    | grep -E "(${smartmon_attrs})"
+  sed 's/^ \+//g' |
+    awk -v labels="${labels}" "${parse_smartctl_attributes_awk}" 2>/dev/null |
+    tr A-Z a-z |
+    grep -E "(${smartmon_attrs})"
 }
 
 parse_smartctl_scsi_attributes() {
-    local disk="$1"
-    local disk_type="$2"
-    local labels="disk=\"${disk}\",type=\"${disk_type}\""
-    while read line ; do
-      attr_type="$(echo "${line}" | tr '=' ':' | cut -f1 -d: | sed 's/^ \+//g' | tr ' ' '_')"
-      attr_value="$(echo "${line}" | tr '=' ':' | cut -f2 -d: | sed 's/^ \+//g')"
-      case "${attr_type}" in
-        number_of_hours_powered_up_) power_on="$( echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;;
-        Current_Drive_Temperature) temp_cel="$(echo ${attr_value} | cut -f1 -d' ' | awk '{ printf "%e\n", $1 }')" ;;
-        Blocks_read_from_cache_and_sent_to_initiator_) lbas_read="$(echo ${attr_value} | awk '{ printf "%e\n", $1 }')" ;;
-        Accumulated_start-stop_cycles) power_cycle="$(echo ${attr_value} | awk '{ printf "%e\n", $1 }')" ;;
-      esac
-    done
-    echo "power_on_hours_raw_value{"${labels}",smart_id=\"9\"} ${power_on}"
-    echo "temperature_celsius_raw_value{"${labels}",smart_id=\"194\"} ${temp_cel}"
-    echo "total_lbas_read_raw_value{"${labels}",smart_id=\"242\"} ${lbas_read}"
-    echo "power_cycle_count_raw_value{"${labels}",smart_id=\"12\"} ${power_cycle}"
+  # Reads "name: value" / "name = value" lines on stdin; prints one sample per counter that was found.
+  local disk="$1" disk_type="$2" line attr_type attr_value power_on temp_cel lbas_read power_cycle grown_defects
+  local labels="disk=\"${disk}\",type=\"${disk_type}\""
+  while read -r line; do
+    attr_type="$(echo "${line}" | tr '=' ':' | cut -f1 -d: | sed 's/^ \+//g' | tr ' ' '_')"
+    attr_value="$(echo "${line}" | tr '=' ':' | cut -f2 -d: | sed 's/^ \+//g')"
+    case "${attr_type}" in
+    number_of_hours_powered_up_) power_on="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;;
+    Current_Drive_Temperature) temp_cel="$(echo "${attr_value}" | cut -f1 -d' ' | awk '{ printf "%e\n", $1 }')" ;;
+    Blocks_read_from_cache_and_sent_to_initiator_) lbas_read="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;;
+    Accumulated_start-stop_cycles) power_cycle="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;;
+    Elements_in_grown_defect_list) grown_defects="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;;
+    esac
+  done
+  [ -n "$power_on" ] && echo "power_on_hours_raw_value{${labels},smart_id=\"9\"} ${power_on}"
+  [ -n "$temp_cel" ] && echo "temperature_celsius_raw_value{${labels},smart_id=\"194\"} ${temp_cel}"
+  [ -n "$lbas_read" ] && echo "total_lbas_read_raw_value{${labels},smart_id=\"242\"} ${lbas_read}"
+  [ -n "$power_cycle" ] && echo "power_cycle_count_raw_value{${labels},smart_id=\"12\"} ${power_cycle}"
+  [ -n "$grown_defects" ] && echo "grown_defects_count_raw_value{${labels},smart_id=\"-1\"} ${grown_defects}" # -1: no SMART id of its own (12 is power_cycle_count)
 }
 
 parse_smartctl_info() {
   local -i smart_available=0 smart_enabled=0 smart_healthy=0
   local disk="$1" disk_type="$2"
   local model_family='' device_model='' serial_number='' fw_version='' vendor='' product='' revision='' lun_id=''
-  while read line ; do
+  while read line; do
     info_type="$(echo "${line}" | cut -f1 -d: | tr ' ' '_')"
     info_value="$(echo "${line}" | cut -f2- -d: | sed 's/^ \+//g' | sed 's/"/\\"/')"
     case "${info_type}" in
-      Model_Family) model_family="${info_value}" ;;
-      Device_Model) device_model="${info_value}" ;;
-      Serial_Number) serial_number="${info_value}" ;;
-      Firmware_Version) fw_version="${info_value}" ;;
-      Vendor) vendor="${info_value}" ;;
-      Product) product="${info_value}" ;;
-      Revision) revision="${info_value}" ;;
-      Logical_Unit_id) lun_id="${info_value}" ;;
+    Model_Family) model_family="${info_value}" ;;
+    Device_Model) device_model="${info_value}" ;;
+    Serial_Number) serial_number="${info_value}" ;;
+    Firmware_Version) fw_version="${info_value}" ;;
+    Vendor) vendor="${info_value}" ;;
+    Product) product="${info_value}" ;;
+    Revision) revision="${info_value}" ;;
+    Logical_Unit_id) lun_id="${info_value}" ;;
     esac
-    if [[ "${info_type}" == 'SMART_support_is' ]] ; then
+    if [[ "${info_type}" == 'SMART_support_is' ]]; then
       case "${info_value:0:7}" in
-        Enabled) smart_enabled=1 ;;
-        Availab) smart_available=1 ;;
-        Unavail) smart_available=0 ;;
+      Enabled) smart_enabled=1 ;;
+      Availab) smart_available=1 ;;
+      Unavail) smart_available=0 ;;
       esac
     fi
-    if [[ "${info_type}" == 'SMART_overall-health_self-assessment_test_result' ]] ; then
+    if [[ "${info_type}" == 'SMART_overall-health_self-assessment_test_result' ]]; then
       case "${info_value:0:6}" in
-        PASSED) smart_healthy=1 ;;
+      PASSED) smart_healthy=1 ;;
       esac
-    elif [[ "${info_type}" == 'SMART_Health_Status' ]] ; then
+    elif [[ "${info_type}" == 'SMART_Health_Status' ]]; then
       case "${info_value:0:2}" in
-        OK) smart_healthy=1 ;;
+      OK) smart_healthy=1 ;;
       esac
     fi
   done
@@ -130,7 +137,8 @@ parse_smartctl_info() {
   echo "device_smart_healthy{disk=\"${disk}\",type=\"${disk_type}\"} ${smart_healthy}"
 }
 
-output_format_awk="$(cat << 'OUTPUTAWK'
+output_format_awk="$(
+  cat <<'OUTPUTAWK'
 BEGIN { v = "" }
 v != $1 {
   print "# HELP smartmon_" $1 " SMART metric " $1;
@@ -142,15 +150,15 @@ OUTPUTAWK
 )"
 
 format_output() {
-  sort \
-  | awk -F'{' "${output_format_awk}"
+  sort | # sorting makes all samples of one metric name adjacent
+    awk -F'{' "${output_format_awk}" # with '{' as separator, $1 is the bare metric name
 }
 
-smartctl_version="$(/usr/sbin/smartctl -V | head -n1  | awk '$1 == "smartctl" {print $2}')"
+smartctl_version="$(/usr/sbin/smartctl -V | head -n1 | awk '$1 == "smartctl" {print $2}')" # 2nd word of the first line, if that line starts with "smartctl"
 
 echo "smartctl_version{version=\"${smartctl_version}\"} 1" | format_output
 
-if [[ "$(expr "${smartctl_version}" : '\([0-9]*\)\..*')" -lt 6 ]] ; then
+if [[ "$(expr "${smartctl_version}" : '\([0-9]*\)\..*')" -lt 6 ]]; then # digits before the first dot; below 6 only the version metric is emitted
   exit
 fi
 
@@ -159,13 +167,18 @@ device_list="$(/usr/sbin/smartctl --scan-open | awk '/^\/dev/{print $1 "|" $3}')
 for device in ${device_list}; do
   disk="$(echo ${device} | cut -f1 -d'|')"
   type="$(echo ${device} | cut -f2 -d'|')"
-  echo "smartctl_run{disk=\"${disk}\",type=\"${type}\"}" $(TZ=UTC date '+%s')
+  echo "smartctl_run{disk=\"${disk}\",type=\"${type}\"}" "$(TZ=UTC date '+%s')" # sample value: epoch seconds of this run
   # Get the SMART information and health
   /usr/sbin/smartctl -i -H -d "${type}" "${disk}" | parse_smartctl_info "${disk}" "${type}"
   # Get the SMART attributes
   case ${type} in
-    sat) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" ;;
-    scsi) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" ;;
-    *) echo "disk type is not sat or scsi, ${type}"; exit ;;
+  sat) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" ;;
+  sat+megaraid*) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" ;;
+  scsi) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" ;;
+  megaraid*) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" ;;
+  *)
+    echo "disk type is not sat, scsi or megaraid but ${type}" >&2 # stdout of this loop is piped into format_output as metrics
+    exit
+    ;;
   esac
 done | format_output
diff --git a/text_collector_examples/storcli.py b/text_collector_examples/storcli.py
index 3166290..48e2bba 100755
--- a/text_collector_examples/storcli.py
+++ b/text_collector_examples/storcli.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 """
 Script to parse StorCLI's JSON output and expose
 MegaRAID health as Prometheus metrics.
@@ -19,110 +19,181 @@ import argparse
 import json
 import os
 import subprocess
+import shlex
+from dateutil.parser import parse
+import collections
+from enum import IntEnum
 
 DESCRIPTION = """Parses StorCLI's JSON output and exposes MegaRAID health as
     Prometheus metrics."""
-VERSION = '0.0.1'
+VERSION = '0.0.2'
+
+storcli_path = ''  # overwritten in main() with the --storcli_path argument
+metric_prefix = 'megaraid_'  # prepended to every metric name that is printed
+# metric name -> list of {'labels': ..., 'value': ...} samples, filled by add_metric()
+metric_list = collections.defaultdict(list)
+
+
+class VD_State(IntEnum):  # VD LIST 'State' abbreviation -> number exported as vd_status
+    Optl = 0  # Optimal
+    Dgrd = 1  # Degraded
+    Pdgd = 2  # Partially Degraded
+    OfLn = 3  # Offline
+    Rec = 4  # Recovery
+    Cac = 5  # CacheCade
 
 
 def main(args):
     """ main """
-
-    # exporter variables
-    metric_prefix = 'megaraid_'
-    metric_controller_labels = '{{controller="{}", model="{}"}}'
-
-    data = json.loads(get_storcli_json(args.storcli_path))
-
-    # It appears that the data we need will always be present in the first
-    # item in the Controllers array
-    status = data['Controllers'][0]
-
-    metrics = {
-        'status_code': status['Command Status']['Status Code'],
-        'controllers': status['Response Data']['Number of Controllers'],
-    }
-
-    for name, value in metrics.iteritems():
-        print('# HELP {}{} MegaRAID {}'.format(metric_prefix, name, name.replace('_', ' ')))
-        print('# TYPE {}{} gauge'.format(metric_prefix, name))
-        print("{}{} {}".format(metric_prefix, name, value))
-
-    controller_info = []
-    controller_metrics = {}
-    overview = []
-
-    try:
-        overview = status['Response Data']['System Overview']
-    except KeyError:
-        pass
-
-    for controller in overview:
-        controller_index = controller['Ctl']
-        model = controller['Model']
-        controller_info.append(metric_controller_labels.format(controller_index, model))
-
-        controller_metrics = {
-            # FIXME: Parse dimmer switch options
-            # 'dimmer_switch':          controller['DS'],
-
-            'battery_backup_healthy':   int(controller['BBU'] == 'Opt'),
-            'degraded':                 int(controller['Hlth'] == 'Dgd'),
-            'drive_groups':             controller['DGs'],
-            'emergency_hot_spare':      int(controller['EHS'] == 'Y'),
-            'failed':                   int(controller['Hlth'] == 'Fld'),
-            'healthy':                  int(controller['Hlth'] == 'Opt'),
-            'physical_drives':          controller['PDs'],
-            'ports':                    controller['Ports'],
-            'scheduled_patrol_read':    int(controller['sPR'] == 'On'),
-            'virtual_drives':           controller['VDs'],
-
-            # Reverse StorCLI's logic to make metrics consistent
-            'drive_groups_optimal':     int(controller['DNOpt'] == 0),
-            'virtual_drives_optimal':   int(controller['VNOpt'] == 0),
-            }
-
-    for name, value in controller_metrics.iteritems():
-        print('# HELP {}{} MegaRAID {}'.format(metric_prefix, name, name.replace('_', ' ')))
-        print('# TYPE {}{} gauge'.format(metric_prefix, name))
-        print('{}{}{{controller="{}"}} {}'.format(metric_prefix, name,
-                                                  controller_index, value))
-
-    if controller_info:
-        print('# HELP {}{} MegaRAID controller info'.format(metric_prefix, 'controller_info'))
-        print('# TYPE {}{} gauge'.format(metric_prefix, 'controller_info'))
-    for labels in controller_info:
-        print('{}{}{} {}'.format(metric_prefix, 'controller_info', labels, 1))
-
-
-def get_storcli_json(storcli_path):
+    global storcli_path
+    storcli_path = args.storcli_path
+    data = json.loads(get_storcli_json('/cALL show all J'))
+
+    # All the information is collected underneath the Controllers key
+    data = data['Controllers']
+
+    # Each entry describes one controller. Its 'Response Data' is dispatched on
+    # the reported driver name: megaraid_sas controllers are exported in detail,
+    # mpt3sas controllers go to a handler that currently exports nothing, and
+    # controllers with any other driver name are ignored.
+
+    for controller in data:
+        response = controller['Response Data']
+        if response['Version']['Driver Name'] == 'megaraid_sas':
+            handle_megaraid_controller(response)
+        elif response['Version']['Driver Name'] == 'mpt3sas':
+            handle_sas_controller(response)
+
+    # The handlers only collect samples in the module-level metric_list (through
+    # add_metric), so nothing has been written at this point. Printing in one
+    # go gives each metric a single HELP/TYPE header followed by the samples of
+    # all controllers.
+    print_all_metrics(metric_list)
+
+
+def handle_sas_controller(response):
+    pass  # placeholder: main() calls this for 'mpt3sas' controllers; no metrics are added for them
+
+
+def handle_megaraid_controller(response):
+    """Add controller, virtual drive and physical drive metrics for one controller.
+
+    response -- the 'Response Data' dict of one controller from '/cALL show all J'.
+    """
+    controller_index = response['Basics']['Controller']
+    baselabel = 'controller="{}"'.format(controller_index)
+
+    controller_info_label = baselabel + ',model="{}",serial="{}",fwversion="{}"'.format(
+        response['Basics']['Model'], response['Basics']['Serial Number'],
+        response['Version']['Firmware Version'])
+    add_metric('controller_info', controller_info_label, 1)
+
+    add_metric('battery_backup_healthy', baselabel, int(response['Status']['BBU Status'] == 0))
+    add_metric('degraded', baselabel, int(response['Status']['Controller Status'] == 'Degraded'))
+    add_metric('failed', baselabel, int(response['Status']['Controller Status'] == 'Failed'))
+    add_metric('healthy', baselabel, int(response['Status']['Controller Status'] == 'Optimal'))
+    add_metric('drive_groups', baselabel, response['Drive Groups'])
+    add_metric('virtual_drives', baselabel, response['Virtual Drives'])
+    add_metric('physical_drives', baselabel, response['Physical Drives'])
+    add_metric('ports', baselabel, response['HwCfg']['Backend Port Count'])
+    add_metric('scheduled_patrol_read', baselabel,
+               int('hrs' in response['Scheduled Tasks']['Patrol Read Reoccurrence']))
+
+    system_time = response['Basics'].get('Current System Date/time')
+    controller_time = response['Basics'].get('Current Controller Date/Time')
+    if system_time and controller_time:  # parse(None) raises, so test the raw strings first
+        # total_seconds() keeps whole days, which timedelta.seconds silently drops
+        drift = abs(parse(system_time) - parse(controller_time)).total_seconds()
+        add_metric('time_difference', baselabel, int(drift))
+
+    for virtual_drive in response.get('VD LIST', []):  # key may be absent without any VDs
+        vd_position = virtual_drive.get('DG/VD')
+        drive_group, volume_group = -1, -1
+        if vd_position:
+            drive_group = vd_position.split('/')[0]
+            volume_group = vd_position.split('/')[1]
+        vd_baselabel = 'controller="{}",DG="{}",VG="{}"'.format(controller_index, drive_group,
+                                                                volume_group)
+        vd_info_label = vd_baselabel + ',name="{}",cache="{}",type="{}"'.format(
+            virtual_drive.get('Name'), virtual_drive.get('Cache'), virtual_drive.get('TYPE'))
+        add_metric('vd_info', vd_info_label, 1)
+        if virtual_drive.get('State') in VD_State.__members__:  # no gauge value for other states
+            add_metric('vd_status', vd_baselabel, int(VD_State[virtual_drive.get('State')]))
+
+    drive_info = {}  # without detail data every lookup below raises KeyError and is skipped
+    if response['Physical Drives'] > 0:
+        data = json.loads(get_storcli_json('/cALL/eALL/sALL show all J'))
+        drive_info = data['Controllers'][controller_index]['Response Data']
+    for physical_drive in response.get('PD LIST', []):  # key may be absent without any PDs
+        enclosure = physical_drive.get('EID:Slt').split(':')[0]
+        slot = physical_drive.get('EID:Slt').split(':')[1]
+        pd_baselabel = 'controller="{}",enclosure="{}",slot="{}"'.format(
+            controller_index, enclosure, slot)
+        pd_info_label = pd_baselabel + ',disk_id="{}",interface="{}",media="{}",model="{}"'.format(
+            physical_drive.get('DID'), physical_drive.get('Intf'), physical_drive.get('Med'),
+            physical_drive.get('Model').strip())
+        drive_identifier = 'Drive /c{}/e{}/s{}'.format(controller_index, enclosure, slot)
+        try:
+            info = drive_info[drive_identifier + ' - Detailed Information']
+            state = info[drive_identifier + ' State']
+            attributes = info[drive_identifier + ' Device attributes']
+            settings = info[drive_identifier + ' Policies/Settings']
+            add_metric('pd_shield_counter', pd_baselabel, state['Shield Counter'])
+            add_metric('pd_media_errors_total', pd_baselabel, state['Media Error Count'])
+            add_metric('pd_other_errors_total', pd_baselabel, state['Other Error Count'])
+            add_metric('pd_predictive_errors_total', pd_baselabel,
+                       state['Predictive Failure Count'])
+            add_metric('pd_smart_alerted', pd_baselabel,
+                       int(state['S.M.A.R.T alert flagged by drive'] == 'Yes'))
+            add_metric('pd_link_speed_gbps', pd_baselabel, attributes['Link Speed'].split('.')[0])
+            add_metric('pd_device_speed_gbps', pd_baselabel,
+                       attributes['Device Speed'].split('.')[0])
+            add_metric('pd_commissioned_spare', pd_baselabel,
+                       int(settings['Commissioned Spare'] == 'Yes'))
+            add_metric('pd_emergency_spare', pd_baselabel,
+                       int(settings['Emergency Spare'] == 'Yes'))
+            pd_info_label += ',firmware="{}"'.format(attributes['Firmware Revision'])
+        except KeyError:
+            pass  # detail data incomplete: still export pd_info, without the firmware label
+        add_metric('pd_info', pd_info_label, 1)
+
+
+def add_metric(name, labels, value):
+    """Record one sample for metric `name` in the module-level metric_list.
+
+    `labels` is the ready-made label string and `value` the sample value.
+    """
+    metric_list[name].append({'labels': labels, 'value': value})
+
+
+def print_all_metrics(metrics):
+    """Print each metric's HELP and TYPE header followed by one line per recorded sample."""
+    for name, rows in metrics.items():
+        print('# HELP {}{} MegaRAID {}'.format(metric_prefix, name, name.replace('_', ' ')))
+        print('# TYPE {}{} gauge'.format(metric_prefix, name))
+        for row in rows:
+            print('{}{}{} {}'.format(metric_prefix, name, '{' + row['labels'] + '}', row['value']))
+
+
+def get_storcli_json(storcli_args):
     """Get storcli output in JSON format."""
+    # Check if storcli is installed and executable
+    if not (os.path.isfile(storcli_path) and os.access(storcli_path, os.X_OK)):
+        raise SystemExit(1)  # must be raised: a bare SystemExit(1) expression has no effect
+    storcli_cmd = [storcli_path] + shlex.split(storcli_args)  # path stays a single argv element
+    proc = subprocess.Popen(
+        storcli_cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    output_json = proc.communicate()[0]
+
+    return output_json.decode("utf-8")
 
-    # Check if storcli is installed
-    if os.path.isfile(storcli_path) and os.access(storcli_path, os.X_OK):
-        storcli_cmd = [storcli_path, 'show', 'all', 'J']
-        proc = subprocess.Popen(storcli_cmd, shell=False,
-                                stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        output_json = proc.communicate()[0]
-    else:
-        # Create an empty dummy-JSON where storcli not installed.
-        dummy_json = {"Controllers":[{
-            "Command Status": {"Status Code": 0, "Status": "Success",
-                               "Description": "None"},
-            "Response Data": {"Number of Controllers": 0}}]}
-        output_json = json.dumps(dummy_json)
-
-    return output_json
 
 if __name__ == "__main__":
-    PARSER = argparse.ArgumentParser(description=DESCRIPTION,
-                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    PARSER.add_argument('--storcli_path',
-                        default='/opt/MegaRAID/storcli/storcli64',
-                        help='path to StorCLi binary')
-    PARSER.add_argument('--version',
-                        action='version',
-                        version='%(prog)s {}'.format(VERSION))
+    PARSER = argparse.ArgumentParser(  # this formatter class adds option defaults to --help
+        description=DESCRIPTION, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    PARSER.add_argument(  # main() copies this value into the module-level storcli_path
+        '--storcli_path', default='/opt/MegaRAID/storcli/storcli64', help='path to StorCLi binary')
+    PARSER.add_argument('--version', action='version', version='%(prog)s {}'.format(VERSION))
     ARGS = PARSER.parse_args()
 
     main(ARGS)
-- 
cgit v1.2.3