aboutsummaryrefslogtreecommitdiff
path: root/text_collector_examples/nvme_metrics.sh
blob: 5cc23cf874ca882a2aaf987eb2e7e4464426f2c7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env bash
set -eu

# Dependencies: nvme-cli, jq (packages)
# Based on code from
# - https://github.com/prometheus/node_exporter/blob/master/text_collector_examples/smartmon.sh
# - https://github.com/prometheus/node_exporter/blob/master/text_collector_examples/mellanox_hca_temp
# - https://github.com/vorlon/check_nvme/blob/master/check_nvme.sh
#
# Author: Henk <henk@wearespindle.com>

# Check if we are root; the script refuses to run for any other user
if [[ "${EUID}" -ne 0 ]]; then
  echo "${0##*/}: Please run as root!" >&2
  exit 1
fi

# Check if programs are installed. Both nvme (nvme-cli) and jq are invoked by
# this script, so a missing one is reported up front instead of surfacing later
# as a "command not found" in the middle of the metric output.
for required_program in nvme jq; do
  if ! command -v "${required_program}" >/dev/null 2>&1; then
    echo "${0##*/}: ${required_program} is not installed. Aborting." >&2
    exit 1
  fi
done

# awk program applied by format_output to the sorted metric lines. It is run
# with "{" as field separator, so $1 is the bare metric name. Whenever that
# name differs from the one on the previous line, a "# HELP" and a "# TYPE"
# header are printed first (counter for names ending in _total, gauge for
# everything else). Every input line is then printed with an "nvme_" prefix.
output_format_awk='
BEGIN { previous = "" }
$1 != previous {
  kind = ($1 ~ /_total$/) ? "counter" : "gauge"
  print "# HELP nvme_" $1 " SMART metric " $1
  print "# TYPE nvme_" $1 " " kind
  previous = $1
}
{ print "nvme_" $0 }
'

# Turn raw metric lines into the final exposition output.
# Globals: output_format_awk (read) - awk program run over the sorted lines
# Input:   metric lines on stdin, e.g. name{label="value"} 1
# Output:  the result of the awk program, on stdout
format_output() {
  # Sort bytewise. Locale-aware collation may ignore punctuation such as "_"
  # and "{", which can interleave lines of metrics whose names differ only in
  # those characters; the awk program reacts to every change of the metric
  # name, so the lines of one metric have to stay adjacent.
  LC_ALL=C sort | awk -F'{' "${output_format_awk}"
}

# Publish the nvme-cli version as the label of a metric with constant value 1.
# The version is the third word of the "nvme ..." line printed by `nvme version`.
nvme_version="$(nvme version | awk '$1 == "nvme" {print $3}')"
format_output <<<"nvmecli{version=\"${nvme_version}\"} 1"

# Get devices
device_list="$(nvme list | awk '/^\/dev/{print $1}')"

# Loop through the NVMe devices. ${device_list} is left unquoted on purpose so
# that it is split into one word per device path.
for device in ${device_list}; do
  # Do not let one unreadable device abort the loop and silently drop every
  # device after it; report the failure and carry on with the next one.
  if ! json_check="$(nvme smart-log -o json "${device}")"; then
    echo "${0##*/}: could not read SMART log of ${device}. Skipping." >&2
    continue
  fi

  # Device label: the path without "/dev/" and without the namespace suffix
  # ("/dev/nvme0n1" -> "nvme0"). Stripping by pattern rather than by fixed
  # character positions keeps multi-digit controllers intact
  # ("/dev/nvme10n1" -> "nvme10", not "nvme1").
  # NOTE(review): several namespaces of one controller still map to the same
  # label value; confirm whether such hosts need to be supported.
  disk="${device#/dev/}"
  disk="${disk%n[0-9]*}"

  # Render every metric line of this device with a single jq invocation
  # instead of spawning one jq process per field.
  jq -r --arg disk "${disk}" '
    def metric(name; value): "\(name){device=\"\($disk)\"} \(value)";

    # The temperature value in JSON is in Kelvin, we want Celsius.
    # NOTE(review): "celcius" is misspelled, but the name is kept because
    # renaming the metric would break existing queries.
    metric("temperature_celcius"; .temperature - 273),

    # Percentages are exported as ratios.
    metric("available_spare_ratio"; .avail_spare / 100),
    metric("available_spare_threshold_ratio"; .spare_thresh / 100),
    metric("percentage_used_ratio"; .percent_used / 100),

    metric("critical_warning_total"; .critical_warning),
    metric("media_errors_total"; .media_errors),
    metric("num_err_log_entries_total"; .num_err_log_entries),
    metric("power_cycles_total"; .power_cycles),
    metric("power_on_hours_total"; .power_on_hours),

    # The SMART log reports Controller Busy Time in minutes, while the metric
    # name promises seconds.
    metric("controller_busy_time_seconds"; .controller_busy_time * 60),

    metric("data_units_written_total"; .data_units_written),
    metric("data_units_read_total"; .data_units_read),
    metric("host_read_commands_total"; .host_read_commands),
    metric("host_write_commands_total"; .host_write_commands)
  ' <<<"${json_check}"
done | format_output