diff options
author | Johannes 'fish' Ziemke <github@freigeist.org> | 2019-08-03 12:14:51 +0200 |
---|---|---|
committer | Ben Kochie <superq@gmail.com> | 2019-08-03 12:14:51 +0200 |
commit | fc73586c971225037aa09b5462031b9694278c74 (patch) | |
tree | 153f18c687e0c89426de3814294dcac261484c52 /text_collector_examples | |
parent | 0b710bb0c95c32402477c7df0ad74c2e4f13c4d9 (diff) | |
download | prometheus_node_collector-fc73586c971225037aa09b5462031b9694278c74.tar.bz2 prometheus_node_collector-fc73586c971225037aa09b5462031b9694278c74.tar.xz prometheus_node_collector-fc73586c971225037aa09b5462031b9694278c74.zip |
Remove text_collector_examples/ (#1441)
* Remove text_collector_examples/
These have been moved to https://github.com/prometheus-community/node-exporter-textfile-collector-scripts
This closes #1077
Signed-off-by: Johannes 'fish' Ziemke <github@freigeist.org>
Diffstat (limited to 'text_collector_examples')
-rw-r--r-- | text_collector_examples/README.md | 16 | ||||
-rwxr-xr-x | text_collector_examples/apt.sh | 32 | ||||
-rwxr-xr-x | text_collector_examples/btrfs_stats.py | 112 | ||||
-rwxr-xr-x | text_collector_examples/deleted_libraries.py | 70 | ||||
-rwxr-xr-x | text_collector_examples/directory-size.sh | 15 | ||||
-rwxr-xr-x | text_collector_examples/inotify-instances | 141 | ||||
-rwxr-xr-x | text_collector_examples/ipmitool | 89 | ||||
-rwxr-xr-x | text_collector_examples/md_info.sh | 56 | ||||
-rwxr-xr-x | text_collector_examples/md_info_detail.sh | 87 | ||||
-rwxr-xr-x | text_collector_examples/mellanox_hca_temp | 59 | ||||
-rwxr-xr-x | text_collector_examples/multipathd_info | 9 | ||||
-rwxr-xr-x | text_collector_examples/ntpd_metrics.py | 122 | ||||
-rwxr-xr-x | text_collector_examples/nvme_metrics.sh | 97 | ||||
-rwxr-xr-x | text_collector_examples/pacman.sh | 33 | ||||
-rwxr-xr-x | text_collector_examples/smartmon.py | 378 | ||||
-rwxr-xr-x | text_collector_examples/smartmon.sh | 194 | ||||
-rwxr-xr-x | text_collector_examples/storcli.py | 242 | ||||
-rwxr-xr-x | text_collector_examples/yum.sh | 18 |
18 files changed, 2 insertions, 1768 deletions
diff --git a/text_collector_examples/README.md b/text_collector_examples/README.md index a26592f..3794261 100644 --- a/text_collector_examples/README.md +++ b/text_collector_examples/README.md | |||
@@ -1,16 +1,4 @@ | |||
1 | # Text collector example scripts | 1 | # Text collector example scripts |
2 | 2 | ||
3 | These scripts are examples to be used with the Node Exporter Textfile | 3 | The scripts have been moved to |
4 | Collector. | 4 | https://github.com/prometheus-community/node-exporter-textfile-collector-scripts |
5 | |||
6 | To use these scripts, we recommend using a `sponge` to atomically write the output. | ||
7 | |||
8 | <collector_script> | sponge <output_file> | ||
9 | |||
10 | Sponge comes from [moreutils](https://joeyh.name/code/moreutils/) | ||
11 | * [brew install moreutils](http://brewformulas.org/Moreutil) | ||
12 | * [apt install moreutils](https://packages.debian.org/search?keywords=moreutils) | ||
13 | * [pkg install moreutils](https://www.freshports.org/sysutils/moreutils/) | ||
14 | |||
15 | For more information see: | ||
16 | https://github.com/prometheus/node_exporter#textfile-collector | ||
diff --git a/text_collector_examples/apt.sh b/text_collector_examples/apt.sh deleted file mode 100755 index 171bb0a..0000000 --- a/text_collector_examples/apt.sh +++ /dev/null | |||
@@ -1,32 +0,0 @@ | |||
1 | #!/bin/bash | ||
2 | # | ||
3 | # Description: Expose metrics from apt updates. | ||
4 | # | ||
5 | # Author: Ben Kochie <superq@gmail.com> | ||
6 | |||
7 | upgrades="$(/usr/bin/apt-get --just-print upgrade \ | ||
8 | | /usr/bin/awk -F'[()]' \ | ||
9 | '/^Inst/ { sub("^[^ ]+ ", "", $2); gsub(" ","",$2); | ||
10 | sub("\\[", " ", $2); sub("\\]", "", $2); print $2 }' \ | ||
11 | | /usr/bin/sort \ | ||
12 | | /usr/bin/uniq -c \ | ||
13 | | awk '{ gsub(/\\\\/, "\\\\", $2); gsub(/\"/, "\\\"", $2); | ||
14 | gsub(/\[/, "", $3); gsub(/\]/, "", $3); | ||
15 | print "apt_upgrades_pending{origin=\"" $2 "\",arch=\"" $3 "\"} " $1}' | ||
16 | )" | ||
17 | |||
18 | echo '# HELP apt_upgrades_pending Apt package pending updates by origin.' | ||
19 | echo '# TYPE apt_upgrades_pending gauge' | ||
20 | if [[ -n "${upgrades}" ]] ; then | ||
21 | echo "${upgrades}" | ||
22 | else | ||
23 | echo 'apt_upgrades_pending{origin="",arch=""} 0' | ||
24 | fi | ||
25 | |||
26 | echo '# HELP node_reboot_required Node reboot is required for software updates.' | ||
27 | echo '# TYPE node_reboot_required gauge' | ||
28 | if [[ -f '/run/reboot-required' ]] ; then | ||
29 | echo 'node_reboot_required 1' | ||
30 | else | ||
31 | echo 'node_reboot_required 0' | ||
32 | fi | ||
diff --git a/text_collector_examples/btrfs_stats.py b/text_collector_examples/btrfs_stats.py deleted file mode 100755 index 68e89a8..0000000 --- a/text_collector_examples/btrfs_stats.py +++ /dev/null | |||
@@ -1,112 +0,0 @@ | |||
1 | #!/usr/bin/env python3 | ||
2 | |||
3 | # Collect per-device btrfs filesystem errors. | ||
4 | # Designed to work on Debian and Centos 6 (with python2.6). | ||
5 | |||
6 | import collections | ||
7 | import glob | ||
8 | import os | ||
9 | import re | ||
10 | import subprocess | ||
11 | |||
12 | def get_btrfs_mount_points(): | ||
13 | """List all btrfs mount points. | ||
14 | |||
15 | Yields: | ||
16 | (string) filesystem mount points. | ||
17 | """ | ||
18 | with open("/proc/mounts") as f: | ||
19 | for line in f: | ||
20 | parts = line.split() | ||
21 | if parts[2] == "btrfs": | ||
22 | yield parts[1] | ||
23 | |||
24 | def get_btrfs_errors(mountpoint): | ||
25 | """Get per-device errors for a btrfs mount point. | ||
26 | |||
27 | Args: | ||
28 | mountpoint: (string) path to a mount point. | ||
29 | |||
30 | Yields: | ||
31 | (device, error_type, error_count) tuples, where: | ||
32 | device: (string) path to block device. | ||
33 | error_type: (string) type of btrfs error. | ||
34 | error_count: (int) number of btrfs errors of a given type. | ||
35 | """ | ||
36 | p = subprocess.Popen(["btrfs", "device", "stats", mountpoint], | ||
37 | stdout=subprocess.PIPE) | ||
38 | (stdout, stderr) = p.communicate() | ||
39 | if p.returncode != 0: | ||
40 | raise RuntimeError("btrfs returned exit code %d" % p.returncode) | ||
41 | for line in stdout.splitlines(): | ||
42 | if line == '': | ||
43 | continue | ||
44 | # Sample line: | ||
45 | # [/dev/vdb1].flush_io_errs 0 | ||
46 | m = re.search(r"^\[([^\]]+)\]\.(\S+)\s+(\d+)$", line.decode("utf-8")) | ||
47 | if not m: | ||
48 | raise RuntimeError("unexpected output from btrfs: '%s'" % line) | ||
49 | yield m.group(1), m.group(2), int(m.group(3)) | ||
50 | |||
51 | def btrfs_error_metrics(): | ||
52 | """Collect btrfs error metrics. | ||
53 | |||
54 | Returns: | ||
55 | a list of strings to be exposed as Prometheus metrics. | ||
56 | """ | ||
57 | metric = "node_btrfs_errors_total" | ||
58 | contents = [ | ||
59 | "# TYPE %s counter" % metric, | ||
60 | "# HELP %s number of btrfs errors" % metric, | ||
61 | ] | ||
62 | errors_by_device = collections.defaultdict(dict) | ||
63 | for mountpoint in get_btrfs_mount_points(): | ||
64 | for device, error_type, error_count in get_btrfs_errors(mountpoint): | ||
65 | contents.append( | ||
66 | '%s{mountpoint="%s",device="%s",type="%s"} %d' % | ||
67 | (metric, mountpoint, device, error_type, error_count)) | ||
68 | |||
69 | if len(contents) > 2: | ||
70 | # return metrics if there are actual btrfs filesystems found | ||
71 | # (i.e. `contents` contains more than just TYPE and HELP). | ||
72 | return contents | ||
73 | |||
74 | def btrfs_allocation_metrics(): | ||
75 | """Collect btrfs allocation metrics. | ||
76 | |||
77 | Returns: | ||
78 | a list of strings to be exposed as Prometheus metrics. | ||
79 | """ | ||
80 | prefix = 'node_btrfs_allocation' | ||
81 | metric_to_filename = { | ||
82 | 'size_bytes': 'total_bytes', | ||
83 | 'used_bytes': 'bytes_used', | ||
84 | 'reserved_bytes': 'bytes_reserved', | ||
85 | 'pinned_bytes': 'bytes_pinned', | ||
86 | 'disk_size_bytes': 'disk_total', | ||
87 | 'disk_used_bytes': 'disk_used', | ||
88 | } | ||
89 | contents = [] | ||
90 | for m, f in metric_to_filename.items(): | ||
91 | contents += [ | ||
92 | "# TYPE %s_%s gauge" % (prefix, m), | ||
93 | "# HELP %s_%s btrfs allocation data (%s)" % (prefix, m, f), | ||
94 | ] | ||
95 | |||
96 | for alloc in glob.glob("/sys/fs/btrfs/*/allocation"): | ||
97 | fs = alloc.split('/')[4] | ||
98 | for type_ in ('data', 'metadata', 'system'): | ||
99 | for m, f in metric_to_filename.items(): | ||
100 | filename = os.path.join(alloc, type_, f) | ||
101 | with open(filename) as f: | ||
102 | value = int(f.read().strip()) | ||
103 | contents.append('%s_%s{fs="%s",type="%s"} %d' % ( | ||
104 | prefix, m, fs, type_, value)) | ||
105 | if len(contents) > 2*len(metric_to_filename): | ||
106 | return contents | ||
107 | |||
108 | if __name__ == "__main__": | ||
109 | contents = ((btrfs_error_metrics() or []) + | ||
110 | (btrfs_allocation_metrics() or [])) | ||
111 | |||
112 | print("\n".join(contents)) | ||
diff --git a/text_collector_examples/deleted_libraries.py b/text_collector_examples/deleted_libraries.py deleted file mode 100755 index 1354d80..0000000 --- a/text_collector_examples/deleted_libraries.py +++ /dev/null | |||
@@ -1,70 +0,0 @@ | |||
1 | #!/usr/bin/env python3 | ||
2 | """ | ||
3 | Script to count the number of deleted libraries that are linked by running | ||
4 | processes and expose a summary as Prometheus metrics. | ||
5 | |||
6 | The aim is to discover processes that are still using libraries that have since | ||
7 | been updated, perhaps due security vulnerabilities. | ||
8 | """ | ||
9 | |||
10 | import errno | ||
11 | import glob | ||
12 | import os | ||
13 | import sys | ||
14 | |||
15 | |||
16 | def main(): | ||
17 | processes_linking_deleted_libraries = {} | ||
18 | |||
19 | for path in glob.glob('/proc/*/maps'): | ||
20 | try: | ||
21 | with open(path, 'rb') as file: | ||
22 | for line in file: | ||
23 | part = line.decode().strip().split() | ||
24 | |||
25 | if len(part) == 7: | ||
26 | library = part[5] | ||
27 | comment = part[6] | ||
28 | |||
29 | if '/lib/' in library and '(deleted)' in comment: | ||
30 | if path not in processes_linking_deleted_libraries: | ||
31 | processes_linking_deleted_libraries[path] = {} | ||
32 | |||
33 | if library in processes_linking_deleted_libraries[path]: | ||
34 | processes_linking_deleted_libraries[path][library] += 1 | ||
35 | else: | ||
36 | processes_linking_deleted_libraries[path][library] = 1 | ||
37 | except EnvironmentError as e: | ||
38 | # Ignore non-existent files, since the files may have changed since | ||
39 | # we globbed. | ||
40 | if e.errno != errno.ENOENT: | ||
41 | sys.exit('Failed to open file: {0}'.format(path)) | ||
42 | |||
43 | num_processes_per_library = {} | ||
44 | |||
45 | for process, library_count in processes_linking_deleted_libraries.items(): | ||
46 | libraries_seen = set() | ||
47 | for library, count in library_count.items(): | ||
48 | if library in libraries_seen: | ||
49 | continue | ||
50 | |||
51 | libraries_seen.add(library) | ||
52 | if library in num_processes_per_library: | ||
53 | num_processes_per_library[library] += 1 | ||
54 | else: | ||
55 | num_processes_per_library[library] = 1 | ||
56 | |||
57 | metric_name = 'node_processes_linking_deleted_libraries' | ||
58 | description = 'Count of running processes that link a deleted library' | ||
59 | print('# HELP {0} {1}'.format(metric_name, description)) | ||
60 | print('# TYPE {0} gauge'.format(metric_name)) | ||
61 | |||
62 | for library, count in num_processes_per_library.items(): | ||
63 | dir_path, basename = os.path.split(library) | ||
64 | basename = basename.replace('"', '\\"') | ||
65 | dir_path = dir_path.replace('"', '\\"') | ||
66 | print('{0}{{library_path="{1}", library_name="{2}"}} {3}'.format(metric_name, dir_path, basename, count)) | ||
67 | |||
68 | |||
69 | if __name__ == "__main__": | ||
70 | main() | ||
diff --git a/text_collector_examples/directory-size.sh b/text_collector_examples/directory-size.sh deleted file mode 100755 index 4aab71d..0000000 --- a/text_collector_examples/directory-size.sh +++ /dev/null | |||
@@ -1,15 +0,0 @@ | |||
1 | #!/bin/sh | ||
2 | # | ||
3 | # Expose directory usage metrics, passed as an argument. | ||
4 | # | ||
5 | # Usage: add this to crontab: | ||
6 | # | ||
7 | # */5 * * * * prometheus directory-size.sh /var/lib/prometheus | sponge /var/lib/node_exporter/directory_size.prom | ||
8 | # | ||
9 | # sed pattern taken from https://www.robustperception.io/monitoring-directory-sizes-with-the-textfile-collector/ | ||
10 | # | ||
11 | # Author: Antoine Beaupré <anarcat@debian.org> | ||
12 | echo "# HELP node_directory_size_bytes Disk space used by some directories" | ||
13 | echo "# TYPE node_directory_size_bytes gauge" | ||
14 | du --block-size=1 --summarize "$@" \ | ||
15 | | sed -ne 's/\\/\\\\/;s/"/\\"/g;s/^\([0-9]\+\)\t\(.*\)$/node_directory_size_bytes{directory="\2"} \1/p' | ||
diff --git a/text_collector_examples/inotify-instances b/text_collector_examples/inotify-instances deleted file mode 100755 index ada74d4..0000000 --- a/text_collector_examples/inotify-instances +++ /dev/null | |||
@@ -1,141 +0,0 @@ | |||
1 | #!/usr/bin/env python3 | ||
2 | |||
3 | """ | ||
4 | Expose Linux inotify(7) instance resource consumption. | ||
5 | |||
6 | Operational properties: | ||
7 | |||
8 | - This script may be invoked as an unprivileged user; in this case, metrics | ||
9 | will only be exposed for processes owned by that unprivileged user. | ||
10 | |||
11 | - No metrics will be exposed for processes that do not hold any inotify fds. | ||
12 | |||
13 | Requires Python 3.5 or later. | ||
14 | """ | ||
15 | |||
16 | import collections | ||
17 | import os | ||
18 | import sys | ||
19 | |||
20 | |||
21 | class Error(Exception): | ||
22 | pass | ||
23 | |||
24 | |||
25 | class _PIDGoneError(Error): | ||
26 | pass | ||
27 | |||
28 | |||
29 | _Process = collections.namedtuple( | ||
30 | "Process", ["pid", "uid", "command", "inotify_instances"]) | ||
31 | |||
32 | |||
33 | def _read_bytes(name): | ||
34 | with open(name, mode='rb') as f: | ||
35 | return f.read() | ||
36 | |||
37 | |||
38 | def _pids(): | ||
39 | for n in os.listdir("/proc"): | ||
40 | if not n.isdigit(): | ||
41 | continue | ||
42 | yield int(n) | ||
43 | |||
44 | |||
45 | def _pid_uid(pid): | ||
46 | try: | ||
47 | s = os.stat("/proc/{}".format(pid)) | ||
48 | except FileNotFoundError: | ||
49 | raise _PIDGoneError() | ||
50 | return s.st_uid | ||
51 | |||
52 | |||
53 | def _pid_command(pid): | ||
54 | # Avoid GNU ps(1) for it truncates comm. | ||
55 | # https://bugs.launchpad.net/ubuntu/+source/procps/+bug/295876/comments/3 | ||
56 | try: | ||
57 | cmdline = _read_bytes("/proc/{}/cmdline".format(pid)) | ||
58 | except FileNotFoundError: | ||
59 | raise _PIDGoneError() | ||
60 | |||
61 | if not len(cmdline): | ||
62 | return "<zombie>" | ||
63 | |||
64 | try: | ||
65 | prog = cmdline[0:cmdline.index(0x00)] | ||
66 | except ValueError: | ||
67 | prog = cmdline | ||
68 | return os.path.basename(prog).decode(encoding="ascii", | ||
69 | errors="surrogateescape") | ||
70 | |||
71 | |||
72 | def _pid_inotify_instances(pid): | ||
73 | instances = 0 | ||
74 | try: | ||
75 | for fd in os.listdir("/proc/{}/fd".format(pid)): | ||
76 | try: | ||
77 | target = os.readlink("/proc/{}/fd/{}".format(pid, fd)) | ||
78 | except FileNotFoundError: | ||
79 | continue | ||
80 | if target == "anon_inode:inotify": | ||
81 | instances += 1 | ||
82 | except FileNotFoundError: | ||
83 | raise _PIDGoneError() | ||
84 | return instances | ||
85 | |||
86 | |||
87 | def _get_processes(): | ||
88 | for p in _pids(): | ||
89 | try: | ||
90 | yield _Process(p, _pid_uid(p), _pid_command(p), | ||
91 | _pid_inotify_instances(p)) | ||
92 | except (PermissionError, _PIDGoneError): | ||
93 | continue | ||
94 | |||
95 | |||
96 | def _get_processes_nontrivial(): | ||
97 | return (p for p in _get_processes() if p.inotify_instances > 0) | ||
98 | |||
99 | |||
100 | def _format_gauge_metric(metric_name, metric_help, samples, | ||
101 | value_func, tags_func=None, stream=sys.stdout): | ||
102 | |||
103 | def _println(*args, **kwargs): | ||
104 | if "file" not in kwargs: | ||
105 | kwargs["file"] = stream | ||
106 | print(*args, **kwargs) | ||
107 | |||
108 | def _print(*args, **kwargs): | ||
109 | if "end" not in kwargs: | ||
110 | kwargs["end"] = "" | ||
111 | _println(*args, **kwargs) | ||
112 | |||
113 | _println("# HELP {} {}".format(metric_name, metric_help)) | ||
114 | _println("# TYPE {} gauge".format(metric_name)) | ||
115 | |||
116 | for s in samples: | ||
117 | value = value_func(s) | ||
118 | tags = None | ||
119 | if tags_func: | ||
120 | tags = tags_func(s) | ||
121 | |||
122 | _print(metric_name) | ||
123 | if tags: | ||
124 | _print("{") | ||
125 | _print(",".join(["{}=\"{}\"".format(k, v) for k, v in tags])) | ||
126 | _print("}") | ||
127 | _print(" ") | ||
128 | _println(value) | ||
129 | |||
130 | |||
131 | def main(args_unused=None): | ||
132 | _format_gauge_metric( | ||
133 | "inotify_instances", | ||
134 | "Total number of inotify instances held open by a process.", | ||
135 | _get_processes_nontrivial(), | ||
136 | lambda s: s.inotify_instances, | ||
137 | lambda s: [("pid", s.pid), ("uid", s.uid), ("command", s.command)]) | ||
138 | |||
139 | |||
140 | if __name__ == "__main__": | ||
141 | sys.exit(main(sys.argv)) | ||
diff --git a/text_collector_examples/ipmitool b/text_collector_examples/ipmitool deleted file mode 100755 index e373b95..0000000 --- a/text_collector_examples/ipmitool +++ /dev/null | |||
@@ -1,89 +0,0 @@ | |||
1 | #!/usr/bin/awk -f | ||
2 | |||
3 | # | ||
4 | # Converts output of `ipmitool sensor` to prometheus format. | ||
5 | # | ||
6 | # With GNU awk: | ||
7 | # ipmitool sensor | ./ipmitool > ipmitool.prom | ||
8 | # | ||
9 | # With BSD awk: | ||
10 | # ipmitool sensor | awk -f ./ipmitool > ipmitool.prom | ||
11 | # | ||
12 | |||
13 | function export(values, name) { | ||
14 | if (values["metric_count"] < 1) { | ||
15 | return | ||
16 | } | ||
17 | delete values["metric_count"] | ||
18 | |||
19 | printf("# HELP %s%s %s sensor reading from ipmitool\n", namespace, name, help[name]); | ||
20 | printf("# TYPE %s%s gauge\n", namespace, name); | ||
21 | for (sensor in values) { | ||
22 | printf("%s%s{sensor=\"%s\"} %f\n", namespace, name, sensor, values[sensor]); | ||
23 | } | ||
24 | } | ||
25 | |||
26 | # Fields are Bar separated, with space padding. | ||
27 | BEGIN { | ||
28 | FS = "[ ]*[|][ ]*"; | ||
29 | namespace = "node_ipmi_"; | ||
30 | |||
31 | # Friendly description of the type of sensor for HELP. | ||
32 | help["temperature_celsius"] = "Temperature"; | ||
33 | help["volts"] = "Voltage"; | ||
34 | help["power_watts"] = "Power"; | ||
35 | help["speed_rpm"] = "Fan"; | ||
36 | help["status"] = "Chassis status"; | ||
37 | |||
38 | temperature_celsius["metric_count"] = 0; | ||
39 | volts["metric_count"] = 0; | ||
40 | power_watts["metric_count"] = 0; | ||
41 | speed_rpm["metric_count"] = 0; | ||
42 | status["metric_count"] = 0; | ||
43 | } | ||
44 | |||
45 | # Not a valid line. | ||
46 | { | ||
47 | if (NF < 3) { | ||
48 | next | ||
49 | } | ||
50 | } | ||
51 | |||
52 | # $2 is value field. | ||
53 | $2 ~ /na/ { | ||
54 | next | ||
55 | } | ||
56 | |||
57 | # $3 is type field. | ||
58 | $3 ~ /degrees C/ { | ||
59 | temperature_celsius[$1] = $2; | ||
60 | temperature_celsius["metric_count"]++; | ||
61 | } | ||
62 | |||
63 | $3 ~ /Volts/ { | ||
64 | volts[$1] = $2; | ||
65 | volts["metric_count"]++; | ||
66 | } | ||
67 | |||
68 | $3 ~ /Watts/ { | ||
69 | power_watts[$1] = $2; | ||
70 | power_watts["metric_count"]++; | ||
71 | } | ||
72 | |||
73 | $3 ~ /RPM/ { | ||
74 | speed_rpm[$1] = $2; | ||
75 | speed_rpm["metric_count"]++; | ||
76 | } | ||
77 | |||
78 | $3 ~ /discrete/ { | ||
79 | status[$1] = sprintf("%d", substr($2,3,2)); | ||
80 | status["metric_count"]++; | ||
81 | } | ||
82 | |||
83 | END { | ||
84 | export(temperature_celsius, "temperature_celsius"); | ||
85 | export(volts, "volts"); | ||
86 | export(power_watts, "power_watts"); | ||
87 | export(speed_rpm, "speed_rpm"); | ||
88 | export(status, "status"); | ||
89 | } | ||
diff --git a/text_collector_examples/md_info.sh b/text_collector_examples/md_info.sh deleted file mode 100755 index c89f10f..0000000 --- a/text_collector_examples/md_info.sh +++ /dev/null | |||
@@ -1,56 +0,0 @@ | |||
1 | #!/usr/bin/env bash | ||
2 | set -eu | ||
3 | |||
4 | for MD_DEVICE in /dev/md/*; do | ||
5 | # Subshell to avoid eval'd variables from leaking between iterations | ||
6 | ( | ||
7 | # Resolve symlink to discover device, e.g. /dev/md127 | ||
8 | MD_DEVICE_NUM=$(readlink -f "${MD_DEVICE}") | ||
9 | |||
10 | # Remove /dev/ prefix | ||
11 | MD_DEVICE_NUM=${MD_DEVICE_NUM#/dev/} | ||
12 | MD_DEVICE=${MD_DEVICE#/dev/md/} | ||
13 | |||
14 | # Query sysfs for info about md device | ||
15 | SYSFS_BASE="/sys/devices/virtual/block/${MD_DEVICE_NUM}/md" | ||
16 | MD_LAYOUT=$(cat "${SYSFS_BASE}/layout") | ||
17 | MD_LEVEL=$(cat "${SYSFS_BASE}/level") | ||
18 | MD_METADATA_VERSION=$(cat "${SYSFS_BASE}/metadata_version") | ||
19 | MD_NUM_RAID_DISKS=$(cat "${SYSFS_BASE}/raid_disks") | ||
20 | |||
21 | # Remove 'raid' prefix from RAID level | ||
22 | MD_LEVEL=${MD_LEVEL#raid} | ||
23 | |||
24 | # Output disk metrics | ||
25 | for RAID_DISK in ${SYSFS_BASE}/rd[0-9]*; do | ||
26 | DISK=$(readlink -f "${RAID_DISK}/block") | ||
27 | DISK_DEVICE=$(basename "${DISK}") | ||
28 | RAID_DISK_DEVICE=$(basename "${RAID_DISK}") | ||
29 | RAID_DISK_INDEX=${RAID_DISK_DEVICE#rd} | ||
30 | RAID_DISK_STATE=$(cat "${RAID_DISK}/state") | ||
31 | |||
32 | DISK_SET="" | ||
33 | # Determine disk set using logic from mdadm: https://github.com/neilbrown/mdadm/commit/2c096ebe4b | ||
34 | if [[ ${RAID_DISK_STATE} == "in_sync" && ${MD_LEVEL} == 10 && $((MD_LAYOUT & ~0x1ffff)) ]]; then | ||
35 | NEAR_COPIES=$((MD_LAYOUT & 0xff)) | ||
36 | FAR_COPIES=$(((MD_LAYOUT >> 8) & 0xff)) | ||
37 | COPIES=$((NEAR_COPIES * FAR_COPIES)) | ||
38 | |||
39 | if [[ $((MD_NUM_RAID_DISKS % COPIES == 0)) && $((COPIES <= 26)) ]]; then | ||
40 | DISK_SET=$((RAID_DISK_INDEX % COPIES)) | ||
41 | fi | ||
42 | fi | ||
43 | |||
44 | echo -n "node_md_disk_info{disk_device=\"${DISK_DEVICE}\", md_device=\"${MD_DEVICE_NUM}\"" | ||
45 | if [[ -n ${DISK_SET} ]]; then | ||
46 | SET_LETTERS=({A..Z}) | ||
47 | echo -n ", md_set=\"${SET_LETTERS[${DISK_SET}]}\"" | ||
48 | fi | ||
49 | echo "} 1" | ||
50 | done | ||
51 | |||
52 | # Output RAID array metrics | ||
53 | # NOTE: Metadata version is a label rather than a separate metric because the version can be a string | ||
54 | echo "node_md_info{md_device=\"${MD_DEVICE_NUM}\", md_name=\"${MD_DEVICE}\", raid_level=\"${MD_LEVEL}\", md_metadata_version=\"${MD_METADATA_VERSION}\"} 1" | ||
55 | ) | ||
56 | done | ||
diff --git a/text_collector_examples/md_info_detail.sh b/text_collector_examples/md_info_detail.sh deleted file mode 100755 index 9806ebb..0000000 --- a/text_collector_examples/md_info_detail.sh +++ /dev/null | |||
@@ -1,87 +0,0 @@ | |||
1 | #!/usr/bin/env bash | ||
2 | # Note: This script uses "mdadm --detail" to get some of the metrics, so it must be run as root. | ||
3 | # It is designed to be run periodically in a cronjob, and output to /var/lib/node_exporter/textfile_collector/md_info_detail.prom | ||
4 | # $ cat /etc/cron.d/prometheus_md_info_detail | ||
5 | # * * * * * bash /var/lib/node_exporter/md_info_detail.sh > /var/lib/node_exporter/md_info_detail.prom.$$ && mv /var/lib/node_exporter/md_info_detail.prom.$$ /var/lib/node_exporter/md_info_detail.prom | ||
6 | |||
7 | set -eu | ||
8 | |||
9 | for MD_DEVICE in /dev/md/*; do | ||
10 | # Subshell to avoid eval'd variables from leaking between iterations | ||
11 | ( | ||
12 | # Resolve symlink to discover device, e.g. /dev/md127 | ||
13 | MD_DEVICE_NUM=$(readlink -f "${MD_DEVICE}") | ||
14 | |||
15 | # Remove /dev/ prefix | ||
16 | MD_DEVICE_NUM=${MD_DEVICE_NUM#/dev/} | ||
17 | MD_DEVICE=${MD_DEVICE#/dev/md/} | ||
18 | |||
19 | # Query sysfs for info about md device | ||
20 | SYSFS_BASE="/sys/devices/virtual/block/${MD_DEVICE_NUM}/md" | ||
21 | MD_LAYOUT=$(cat "${SYSFS_BASE}/layout") | ||
22 | MD_LEVEL=$(cat "${SYSFS_BASE}/level") | ||
23 | MD_METADATA_VERSION=$(cat "${SYSFS_BASE}/metadata_version") | ||
24 | MD_NUM_RAID_DISKS=$(cat "${SYSFS_BASE}/raid_disks") | ||
25 | |||
26 | # Remove 'raid' prefix from RAID level | ||
27 | MD_LEVEL=${MD_LEVEL#raid} | ||
28 | |||
29 | # Output disk metrics | ||
30 | for RAID_DISK in ${SYSFS_BASE}/rd[0-9]*; do | ||
31 | DISK=$(readlink -f "${RAID_DISK}/block") | ||
32 | DISK_DEVICE=$(basename "${DISK}") | ||
33 | RAID_DISK_DEVICE=$(basename "${RAID_DISK}") | ||
34 | RAID_DISK_INDEX=${RAID_DISK_DEVICE#rd} | ||
35 | RAID_DISK_STATE=$(cat "${RAID_DISK}/state") | ||
36 | |||
37 | DISK_SET="" | ||
38 | # Determine disk set using logic from mdadm: https://github.com/neilbrown/mdadm/commit/2c096ebe4b | ||
39 | if [[ ${RAID_DISK_STATE} == "in_sync" && ${MD_LEVEL} == 10 && $((MD_LAYOUT & ~0x1ffff)) ]]; then | ||
40 | NEAR_COPIES=$((MD_LAYOUT & 0xff)) | ||
41 | FAR_COPIES=$(((MD_LAYOUT >> 8) & 0xff)) | ||
42 | COPIES=$((NEAR_COPIES * FAR_COPIES)) | ||
43 | |||
44 | if [[ $((MD_NUM_RAID_DISKS % COPIES == 0)) && $((COPIES <= 26)) ]]; then | ||
45 | DISK_SET=$((RAID_DISK_INDEX % COPIES)) | ||
46 | fi | ||
47 | fi | ||
48 | |||
49 | echo -n "node_md_disk_info{disk_device=\"${DISK_DEVICE}\", md_device=\"${MD_DEVICE_NUM}\"" | ||
50 | if [[ -n ${DISK_SET} ]]; then | ||
51 | SET_LETTERS=({A..Z}) | ||
52 | echo -n ", md_set=\"${SET_LETTERS[${DISK_SET}]}\"" | ||
53 | fi | ||
54 | echo "} 1" | ||
55 | done | ||
56 | |||
57 | # Get output from mdadm --detail (Note: root/sudo required) | ||
58 | MDADM_DETAIL_OUTPUT=$(mdadm --detail /dev/"${MD_DEVICE_NUM}") | ||
59 | |||
60 | # Output RAID "Devices", "Size" and "Event" metrics, from the output of "mdadm --detail" | ||
61 | while IFS= read -r line ; do | ||
62 | # Filter out these keys that have numeric values that increment up | ||
63 | if echo "$line" | grep -E -q "Devices :|Array Size :| Used Dev Size :|Events :"; then | ||
64 | MDADM_DETAIL_KEY=$(echo "$line" | cut -d ":" -f 1 | tr -cd '[a-zA-Z0-9]._-') | ||
65 | MDADM_DETAIL_VALUE=$(echo "$line" | cut -d ":" -f 2 | cut -d " " -f 2 | sed 's:^ ::') | ||
66 | echo "node_md_info_${MDADM_DETAIL_KEY}{md_device=\"${MD_DEVICE_NUM}\", md_name=\"${MD_DEVICE}\", raid_level=\"${MD_LEVEL}\", md_num_raid_disks=\"${MD_NUM_RAID_DISKS}\", md_metadata_version=\"${MD_METADATA_VERSION}\"} ${MDADM_DETAIL_VALUE}" | ||
67 | fi | ||
68 | done <<< "$MDADM_DETAIL_OUTPUT" | ||
69 | |||
70 | # Output RAID detail metrics info from the output of "mdadm --detail" | ||
71 | # NOTE: Sending this info as labels rather than separate metrics, because some of them can be strings. | ||
72 | echo -n "node_md_info{md_device=\"${MD_DEVICE_NUM}\", md_name=\"${MD_DEVICE}\", raid_level=\"${MD_LEVEL}\", md_num_raid_disks=\"${MD_NUM_RAID_DISKS}\", md_metadata_version=\"${MD_METADATA_VERSION}\"" | ||
73 | while IFS= read -r line ; do | ||
74 | # Filter for lines with a ":", to use for Key/Value pairs in labels | ||
75 | if echo "$line" | grep -E -q ":" ; then | ||
76 | # Exclude lines with these keys, as they're values are numbers that increment up and captured in individual metrics above | ||
77 | if echo "$line" | grep -E -qv "Array Size|Used Dev Size|Events|Update Time" ; then | ||
78 | echo -n ", " | ||
79 | MDADM_DETAIL_KEY=$(echo "$line" | cut -d ":" -f 1 | tr -cd '[a-zA-Z0-9]._-') | ||
80 | MDADM_DETAIL_VALUE=$(echo "$line" | cut -d ":" -f 2- | sed 's:^ ::') | ||
81 | echo -n "${MDADM_DETAIL_KEY}=\"${MDADM_DETAIL_VALUE}\"" | ||
82 | fi | ||
83 | fi | ||
84 | done <<< "$MDADM_DETAIL_OUTPUT" | ||
85 | echo "} 1" | ||
86 | ) | ||
87 | done | ||
diff --git a/text_collector_examples/mellanox_hca_temp b/text_collector_examples/mellanox_hca_temp deleted file mode 100755 index 0a9e2b0..0000000 --- a/text_collector_examples/mellanox_hca_temp +++ /dev/null | |||
@@ -1,59 +0,0 @@ | |||
1 | #!/bin/bash | ||
2 | set -eu | ||
3 | |||
4 | # Script to read Mellanox HCA temperature using the Mellanox mget_temp_ext tool | ||
5 | |||
6 | # Copyright 2018 The Prometheus Authors | ||
7 | # | ||
8 | # Licensed under the Apache License, Version 2.0 (the "License"); | ||
9 | # you may not use this file except in compliance with the License. | ||
10 | # You may obtain a copy of the License at | ||
11 | # | ||
12 | # http://www.apache.org/licenses/LICENSE-2.0 | ||
13 | # | ||
14 | # Unless required by applicable law or agreed to in writing, software | ||
15 | # distributed under the License is distributed on an "AS IS" BASIS, | ||
16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
17 | # See the License for the specific language governing permissions and | ||
18 | # limitations under the License. | ||
19 | # | ||
20 | # Author: Jan Phillip Greimann <jan.greimann@cloud.ionos.com> | ||
21 | |||
22 | # check if root | ||
23 | if [ "$EUID" -ne 0 ]; then | ||
24 | echo "${0##*/}: Please run as root!" >&2 | ||
25 | exit 1 | ||
26 | fi | ||
27 | |||
28 | # check if programs are installed | ||
29 | if ! command -v mget_temp_ext >/dev/null 2>&1; then | ||
30 | echo "${0##*/}: mget_temp_ext is not installed. Aborting." >&2 | ||
31 | exit 1 | ||
32 | fi | ||
33 | |||
34 | cat <<EOF | ||
35 | # HELP node_infiniband_hca_temp_celsius Celsius temperature of Mellanox InfiniBand HCA. | ||
36 | # TYPE node_infiniband_hca_temp_celsius gauge | ||
37 | EOF | ||
38 | |||
39 | # run for each found Mellanox device | ||
40 | for dev in /sys/class/infiniband/*; do | ||
41 | if test ! -d "$dev"; then | ||
42 | continue | ||
43 | fi | ||
44 | device="${dev##*/}" | ||
45 | |||
46 | # get temperature | ||
47 | if temperature="$(mget_temp_ext -d "${device}")"; then | ||
48 | # output | ||
49 | echo "node_infiniband_hca_temp_celsius{hca_device=\"${device}\"} ${temperature//[[:space:]]/}" | ||
50 | else | ||
51 | echo "${0##*/}: Failed to get temperature from InfiniBand HCA '${device}'!" >&2 | ||
52 | fi | ||
53 | done | ||
54 | |||
55 | # if device is empty, no device was found | ||
56 | if [ -z "${device-}" ]; then | ||
57 | echo "${0##*/}: No InfiniBand HCA device found!" >&2 | ||
58 | exit 1 | ||
59 | fi | ||
diff --git a/text_collector_examples/multipathd_info b/text_collector_examples/multipathd_info deleted file mode 100755 index cddbb2b..0000000 --- a/text_collector_examples/multipathd_info +++ /dev/null | |||
@@ -1,9 +0,0 @@ | |||
1 | #!/bin/sh | ||
2 | # | ||
3 | # Description: Expose device mapper multipathing metrics from multipathd. | ||
4 | # | ||
5 | # Author: Saket Sinha <saket.sinha@cloud.ionos.com> | ||
6 | |||
7 | echo '# HELP node_dmpath_info State info for dev-mapper path' | ||
8 | echo '# TYPE node_dmpath_info gauge' | ||
9 | /sbin/multipathd show paths format '%d %t %T' | /usr/bin/awk '{ if ( NR > 1) {print "node_dmpath_info{device=\""$1"\"," "dm_path_state=\""$2"\"," "path_state=\""$3"\"}" " 1"}}' | ||
diff --git a/text_collector_examples/ntpd_metrics.py b/text_collector_examples/ntpd_metrics.py deleted file mode 100755 index ab55a13..0000000 --- a/text_collector_examples/ntpd_metrics.py +++ /dev/null | |||
@@ -1,122 +0,0 @@ | |||
1 | #!/usr/bin/env python3 | ||
2 | # | ||
3 | # Description: Extract NTPd metrics from ntpq -np. | ||
4 | # Author: Ben Kochie <superq@gmail.com> | ||
5 | |||
6 | import re | ||
7 | import subprocess | ||
8 | import sys | ||
9 | |||
# NTP peers status, with no DNS lookups.
ntpq_cmd = ['ntpq', '-np']
ntpq_rv_cmd = ['ntpq', '-c', 'rv 0 offset,sys_jitter,rootdisp,rootdelay']

# Regex fragments matching the columns of `ntpq -np` output, in order.
# Raw strings avoid the invalid-escape-sequence warnings that plain strings
# with \w/\d/\s produce on modern Python.
metrics_fields = [
    r'^(?P<status>.)(?P<remote>[\w\.]+)',
    r'(?P<refid>[\w\.]+)',
    r'(?P<stratum>\d+)',
    r'(?P<type>\w)',
    r'(?P<when>\d+)',
    r'(?P<poll>\d+)',
    r'(?P<reach>\d+)',
    r'(?P<delay>\d+\.\d+)',
    r'(?P<offset>-?\d+\.\d+)',
    r'(?P<jitter>\d+\.\d+)',
]
metrics_re = r'\s+'.join(metrics_fields)

# Remote types
# http://support.ntp.org/bin/view/Support/TroubleshootingNTP
remote_types = {
    'l': 'local',
    'u': 'unicast',
    'm': 'multicast',
    'b': 'broadcast',
    '-': 'netaddr',
}

# Status codes:
# http://www.eecis.udel.edu/~mills/ntp/html/decode.html#peer
status_types = {
    ' ': 0,
    'x': 1,
    '.': 2,
    '-': 3,
    '+': 4,
    '#': 5,
    '*': 6,
    'o': 7,
}
51 | |||
52 | |||
# Run the ntpq command.
def get_output(command):
    """Run *command* and return its decoded stdout, or None on non-zero exit.

    stderr is discarded; callers treat None as "command failed/unavailable".
    """
    try:
        output = subprocess.check_output(command, stderr=subprocess.DEVNULL)
    except subprocess.CalledProcessError:
        # Binding the exception was unused; just signal failure.
        return None
    return output.decode()
60 | |||
61 | |||
# Print metrics in Prometheus format.
def print_prometheus(metric, values):
    """Print HELP/TYPE headers plus one sample per entry of *values*.

    Args:
        metric: metric name without the "ntpd_" prefix.
        values: dict mapping a label string (or None for an unlabelled
            sample) to a float sample value.
    """
    print("# HELP ntpd_%s NTPd metric for %s" % (metric, metric))
    print("# TYPE ntpd_%s gauge" % (metric))
    # Iterate items() rather than keys-then-lookup (one pass, no re-hashing).
    for labels, value in values.items():
        if labels is None:
            print("ntpd_%s %f" % (metric, value))
        else:
            print("ntpd_%s{%s} %f" % (metric, labels, value))
71 | |||
72 | |||
# Parse raw ntpq lines.
def parse_line(line):
    """Return a regex match of the peer fields, or None for non-peer lines.

    Header, separator, blank, and pseudo-peer (.LOCL./.POOL.) lines are
    skipped. Raw strings avoid invalid-escape warnings on modern Python.
    """
    if re.match(r'\s+remote\s+refid', line):
        return None
    if re.match(r'=+', line):
        return None
    # .LOCL./.POOL. refids are pseudo entries, not real network peers.
    if re.match(r'.+\.(LOCL|POOL)\.', line):
        return None
    if re.match(r'^$', line):
        return None
    return re.match(metrics_re, line)
84 | |||
85 | |||
# Main function
def main(argv):
    """Collect ntpq peer and system metrics and print them for Prometheus.

    Exits with status 1 (after a stderr message) when ntpq cannot be run,
    instead of crashing on None.split() as the previous version did.
    """
    ntpq = get_output(ntpq_cmd)
    if ntpq is None:
        print('error: failed to run %s' % ' '.join(ntpq_cmd), file=sys.stderr)
        sys.exit(1)
    peer_status_metrics = {}
    delay_metrics = {}
    offset_metrics = {}
    jitter_metrics = {}
    for line in ntpq.split('\n'):
        metric_match = parse_line(line)
        if metric_match is None:
            continue
        remote = metric_match.group('remote')
        refid = metric_match.group('refid')
        stratum = metric_match.group('stratum')
        remote_type = remote_types[metric_match.group('type')]
        common_labels = "remote=\"%s\",reference=\"%s\"" % (remote, refid)
        peer_labels = "%s,stratum=\"%s\",type=\"%s\"" % (common_labels, stratum, remote_type)

        peer_status_metrics[peer_labels] = float(status_types[metric_match.group('status')])
        delay_metrics[common_labels] = float(metric_match.group('delay'))
        offset_metrics[common_labels] = float(metric_match.group('offset'))
        jitter_metrics[common_labels] = float(metric_match.group('jitter'))

    print_prometheus('peer_status', peer_status_metrics)
    print_prometheus('delay_milliseconds', delay_metrics)
    print_prometheus('offset_milliseconds', offset_metrics)
    print_prometheus('jitter_milliseconds', jitter_metrics)

    # System variables ("rv 0 ...") come back as "name=value, name=value".
    ntpq_rv = get_output(ntpq_rv_cmd)
    if ntpq_rv is None:
        print('error: failed to run %s' % ' '.join(ntpq_rv_cmd), file=sys.stderr)
        sys.exit(1)
    for metric in ntpq_rv.split(','):
        metric_name, metric_value = metric.strip().split('=')
        print_prometheus(metric_name, {None: float(metric_value)})


# Go go go!
if __name__ == "__main__":
    main(sys.argv[1:])
diff --git a/text_collector_examples/nvme_metrics.sh b/text_collector_examples/nvme_metrics.sh deleted file mode 100755 index 5cc23cf..0000000 --- a/text_collector_examples/nvme_metrics.sh +++ /dev/null | |||
@@ -1,97 +0,0 @@ | |||
#!/usr/bin/env bash
set -eu

# Dependencies: nvme-cli, jq (packages)
# Based on code from
# - https://github.com/prometheus/node_exporter/blob/master/text_collector_examples/smartmon.sh
# - https://github.com/prometheus/node_exporter/blob/master/text_collector_examples/mellanox_hca_temp
# - https://github.com/vorlon/check_nvme/blob/master/check_nvme.sh
#
# Author: Henk <henk@wearespindle.com>

# Check if we are root
if [ "$EUID" -ne 0 ]; then
  echo "${0##*/}: Please run as root!" >&2
  exit 1
fi

# Check if programs are installed
if ! command -v nvme >/dev/null 2>&1; then
  echo "${0##*/}: nvme is not installed. Aborting." >&2
  exit 1
fi

# awk program that prefixes every sample with "nvme_" and emits a HELP/TYPE
# header the first time each metric name appears (input must be sorted).
# Names ending in _total become counters, everything else a gauge.
output_format_awk="$(
  cat <<'OUTPUTAWK'
BEGIN { v = "" }
v != $1 {
  print "# HELP nvme_" $1 " SMART metric " $1;
  if ($1 ~ /_total$/)
    print "# TYPE nvme_" $1 " counter";
  else
    print "# TYPE nvme_" $1 " gauge";
  v = $1
}
{print "nvme_" $0}
OUTPUTAWK
)"

format_output() {
  # Sort so identical metric names are adjacent; split on '{' so awk's $1
  # is the bare metric name.
  sort | awk -F'{' "${output_format_awk}"
}

# Get the nvme-cli version
nvme_version="$(nvme version | awk '$1 == "nvme" {print $3}')"
echo "nvmecli{version=\"${nvme_version}\"} 1" | format_output

# Get devices
device_list="$(nvme list | awk '/^\/dev/{print $1}')"

# Loop through the NVMe devices
for device in ${device_list}; do
  json_check="$(nvme smart-log -o json "${device}")"
  # Characters 6-10 of e.g. "/dev/nvme0n1" give "nvme0" (namespace dropped).
  disk="$(echo "${device}" | cut -c6-10)"

  # The temperature value in JSON is in Kelvin, we want Celsius
  # NOTE(review): metric name keeps the historic "celcius" misspelling;
  # renaming it would silently break existing dashboards/alerts.
  value_temperature="$(echo "$json_check" | jq '.temperature - 273')"
  echo "temperature_celcius{device=\"${disk}\"} ${value_temperature}"

  value_available_spare="$(echo "$json_check" | jq '.avail_spare / 100')"
  echo "available_spare_ratio{device=\"${disk}\"} ${value_available_spare}"

  value_available_spare_threshold="$(echo "$json_check" | jq '.spare_thresh / 100')"
  echo "available_spare_threshold_ratio{device=\"${disk}\"} ${value_available_spare_threshold}"

  value_percentage_used="$(echo "$json_check" | jq '.percent_used / 100')"
  echo "percentage_used_ratio{device=\"${disk}\"} ${value_percentage_used}"

  value_critical_warning="$(echo "$json_check" | jq '.critical_warning')"
  echo "critical_warning_total{device=\"${disk}\"} ${value_critical_warning}"

  value_media_errors="$(echo "$json_check" | jq '.media_errors')"
  echo "media_errors_total{device=\"${disk}\"} ${value_media_errors}"

  value_num_err_log_entries="$(echo "$json_check" | jq '.num_err_log_entries')"
  echo "num_err_log_entries_total{device=\"${disk}\"} ${value_num_err_log_entries}"

  value_power_cycles="$(echo "$json_check" | jq '.power_cycles')"
  echo "power_cycles_total{device=\"${disk}\"} ${value_power_cycles}"

  value_power_on_hours="$(echo "$json_check" | jq '.power_on_hours')"
  echo "power_on_hours_total{device=\"${disk}\"} ${value_power_on_hours}"

  value_controller_busy_time="$(echo "$json_check" | jq '.controller_busy_time')"
  echo "controller_busy_time_seconds{device=\"${disk}\"} ${value_controller_busy_time}"

  value_data_units_written="$(echo "$json_check" | jq '.data_units_written')"
  echo "data_units_written_total{device=\"${disk}\"} ${value_data_units_written}"

  value_data_units_read="$(echo "$json_check" | jq '.data_units_read')"
  echo "data_units_read_total{device=\"${disk}\"} ${value_data_units_read}"

  value_host_read_commands="$(echo "$json_check" | jq '.host_read_commands')"
  echo "host_read_commands_total{device=\"${disk}\"} ${value_host_read_commands}"

  value_host_write_commands="$(echo "$json_check" | jq '.host_write_commands')"
  echo "host_write_commands_total{device=\"${disk}\"} ${value_host_write_commands}"
done | format_output
diff --git a/text_collector_examples/pacman.sh b/text_collector_examples/pacman.sh deleted file mode 100755 index 82ac4cf..0000000 --- a/text_collector_examples/pacman.sh +++ /dev/null | |||
@@ -1,33 +0,0 @@ | |||
#!/bin/bash
#
#
# Description: Expose metrics from pacman updates
# If installed The bash script *checkupdates*, included with the
# *pacman-contrib* package, is used to calculate the number of pending updates.
# Otherwise *pacman* is used for calculation.
#
# Author: Sven Haardiek <sven@haardiek.de>

set -o errexit
set -o nounset
set -o pipefail

if [ -x /usr/bin/checkupdates ]
then
  # checkupdates refreshes a temporary copy of the sync DB, so it is accurate.
  updates=$(/usr/bin/checkupdates | wc -l)
  cache=0
else
  # Fall back to the (possibly stale) local sync DB; pacman -Qu exits
  # non-zero when there are no pending updates, hence the fallback to 0.
  if ! updates=$(/usr/bin/pacman -Qu | wc -l)
  then
    updates=0
  fi
  cache=1
fi

# HELP/TYPE names must match the sample name for valid Prometheus exposition;
# the previous headers said "updates_pending" while the sample was
# "pacman_updates_pending".
echo "# HELP pacman_updates_pending number of pending updates from pacman"
echo "# TYPE pacman_updates_pending gauge"
echo "pacman_updates_pending $updates"

echo "# HELP pacman_updates_pending_from_cache pending updates information are from cache"
echo "# TYPE pacman_updates_pending_from_cache gauge"
echo "pacman_updates_pending_from_cache $cache"
diff --git a/text_collector_examples/smartmon.py b/text_collector_examples/smartmon.py deleted file mode 100755 index 7dbf26e..0000000 --- a/text_collector_examples/smartmon.py +++ /dev/null | |||
@@ -1,378 +0,0 @@ | |||
1 | #!/usr/bin/env python3 | ||
2 | import argparse | ||
3 | import collections | ||
4 | import csv | ||
5 | import datetime | ||
6 | import decimal | ||
7 | import re | ||
8 | import shlex | ||
9 | import subprocess | ||
10 | |||
# Matches "Key: value" and "Key is value" lines from `smartctl --info`.
device_info_re = re.compile(r'^(?P<k>[^:]+?)(?:(?:\sis|):)\s*(?P<v>.*)$')

# Matches the first line of each entry in the extended ATA error log,
# e.g. "Error 42 [3] occurred ..."; group 1 is the running error count.
ata_error_count_re = re.compile(
    r'^Error (\d+) \[\d+\] occurred', re.MULTILINE)

# Overall-health verdict from `smartctl --health` (ATA: PASSED, SCSI: OK).
self_test_re = re.compile(r'^SMART.*(PASSED|OK)$', re.MULTILINE)

# Maps `smartctl --info` keys to label names used on the device_info metric.
device_info_map = {
    'Vendor': 'vendor',
    'Product': 'product',
    'Revision': 'revision',
    'Logical Unit id': 'lun_id',
    'Model Family': 'model_family',
    'Device Model': 'device_model',
    'Serial Number': 'serial_number',
    'Firmware Version': 'firmware_version',
}

# SMART attribute names (lower-cased) that are exported as metrics.
smart_attributes_whitelist = {
    'airflow_temperature_cel',
    'command_timeout',
    'current_pending_sector',
    'end_to_end_error',
    'erase_fail_count_total',
    'g_sense_error_rate',
    'hardware_ecc_recovered',
    'host_reads_mib',
    'host_reads_32mib',
    'host_writes_mib',
    'host_writes_32mib',
    'load_cycle_count',
    'media_wearout_indicator',
    'wear_leveling_count',
    'nand_writes_1gib',
    'offline_uncorrectable',
    'power_cycle_count',
    'power_on_hours',
    'program_fail_count',
    'raw_read_error_rate',
    'reallocated_event_count',
    'reallocated_sector_ct',
    'reported_uncorrect',
    'sata_downshift_count',
    'seek_error_rate',
    'spin_retry_count',
    'spin_up_time',
    'start_stop_count',
    'temperature_case',
    'temperature_celsius',
    'temperature_internal',
    'total_lbas_read',
    'total_lbas_written',
    'udma_crc_error_count',
    'unsafe_shutdown_count',
    'workld_host_reads_perc',
    'workld_media_wear_indic',
    'workload_minutes',
}

# A single Prometheus sample: metric name, label dict, numeric value.
Metric = collections.namedtuple('Metric', 'name labels value')

# One row of the `smartctl --attributes` table. raw_value is last and acts as
# the DictReader restkey, collecting any remaining columns (it may contain
# spaces, e.g. "36 (Min/Max 24/40)").
SmartAttribute = collections.namedtuple('SmartAttribute', [
    'id', 'name', 'flag', 'value', 'worst', 'threshold', 'type', 'updated',
    'when_failed', 'raw_value',
])
76 | |||
77 | |||
_DeviceBase = collections.namedtuple('DeviceBase', 'path opts')


class Device(_DeviceBase):
    """A device as reported by `smartctl --scan-open`.

    `path` is the device node (e.g. /dev/sda); `opts` carries the parsed
    scan-line options, of which only the device type is used here.
    """

    @property
    def type(self):
        """The smartctl device type (the scan line's -d/--device value)."""
        return self.opts.type

    @property
    def base_labels(self):
        """Labels attached to every metric emitted for this device."""
        return {'disk': self.path}

    def smartctl_select(self):
        """smartctl arguments that select this particular device."""
        return ['--device', self.type, self.path]
91 | |||
92 | |||
def metric_key(metric, prefix=''):
    """Return the exposition name for *metric*: prefix + metric name."""
    return '%s%s' % (prefix, metric.name)


def metric_format(metric, prefix=''):
    """Render *metric* as a single Prometheus sample line."""
    label_pairs = ['{k}="{v}"'.format(k=k, v=v)
                   for k, v in metric.labels.items()]
    value = decimal.Decimal(metric.value)

    return '{key}{{{labels}}} {value}'.format(
        key=metric_key(metric, prefix),
        labels=','.join(label_pairs),
        value=value)


def metric_print_meta(metric, prefix=''):
    """Print the # HELP and # TYPE header lines for *metric*."""
    key = metric_key(metric, prefix)
    print('# HELP {key} SMART metric {metric.name}'.format(
        key=key, metric=metric))
    print('# TYPE {key} gauge'.format(key=key, metric=metric))


def metric_print(metric, prefix=''):
    """Print one rendered sample line for *metric*."""
    print(metric_format(metric, prefix))
116 | |||
117 | |||
def smart_ctl(*args, check=True):
    """Wrapper around invoking the smartctl binary.

    Returns:
        (str) Data piped to stdout by the smartctl subprocess; when
        check=True and smartctl exits non-zero, the captured stdout of the
        failed invocation is returned instead of raising.
    """
    command = ['smartctl', *args]
    try:
        completed = subprocess.run(command, stdout=subprocess.PIPE, check=check)
    except subprocess.CalledProcessError as err:
        return err.output.decode('utf-8')
    return completed.stdout.decode('utf-8')


def smart_ctl_version():
    """Return the smartctl version string (second token of the first line)."""
    first_line = smart_ctl('-V').split('\n')[0]
    return first_line.split()[1]
133 | |||
134 | |||
def find_devices():
    """Find SMART devices.

    Parses `smartctl --scan-open` output, where each line looks like
    "/dev/sda -d sat # comment".

    Yields:
        (Device) Single device found by smartctl.
    """
    opt_parser = argparse.ArgumentParser()
    opt_parser.add_argument('-d', '--device', dest='type')

    for scan_line in smart_ctl('--scan-open').split('\n'):
        # shlex drops the trailing "# ..." comment; empty/blank lines yield
        # no tokens at all and are skipped.
        tokens = shlex.split(scan_line.strip(), comments=True)
        if not tokens:
            continue

        yield Device(tokens[0], opt_parser.parse_args(tokens[1:]))
156 | |||
157 | |||
def device_is_active(device):
    """Returns whenever the given device is currently active or not.

    Args:
        device: (Device) Device in question.

    Returns:
        (bool) True if the device is active and False otherwise.
    """
    # Invoke smartctl directly rather than via smart_ctl(): smart_ctl()
    # catches CalledProcessError itself and returns the output, so the
    # except branch here was dead code and every device appeared active.
    try:
        subprocess.run(
            ['smartctl', '--nocheck', 'standby', *device.smartctl_select()],
            stdout=subprocess.PIPE, check=True)
    except subprocess.CalledProcessError:
        return False

    return True
173 | |||
174 | |||
def device_info(device):
    """Query device for basic model information.

    Args:
        device: (Device) Device in question.

    Returns:
        (generator): Generator yielding:

            key (str): Key describing the value.
            value (str): Actual value.
    """
    raw_info = smart_ctl('--info', *device.smartctl_select())
    # The first three lines are the smartctl banner; the key/value section
    # starts after them.
    for info_line in raw_info.strip().split('\n')[3:]:
        match = device_info_re.match(info_line)
        if match is not None:
            yield match.groups()
193 | |||
194 | |||
def device_smart_capabilities(device):
    """Returns SMART capabilities of the given device.

    Args:
        device: (Device) Device in question.

    Returns:
        (tuple): tuple containing:

            (bool): True whenever SMART is available, False otherwise.
            (bool): True whenever SMART is enabled, False otherwise.
    """
    support_values = set()
    for key, value in device_info(device):
        if key == 'SMART support':
            # Keep only the leading word, e.g. "Available" or "Enabled".
            support_values.add(value.split(' ', 1)[0])

    return 'Available' in support_values, 'Enabled' in support_values
217 | |||
218 | |||
def collect_device_info(device):
    """Collect basic device information.

    Args:
        device: (Device) Device in question.

    Yields:
        (Metric) metrics describing general device information.
    """
    values = dict(device_info(device))
    labels = dict(device.base_labels)
    # Translate known smartctl info keys into their metric label names.
    for info_key, label_name in device_info_map.items():
        if info_key in values:
            labels[label_name] = values[info_key]

    yield Metric('device_info', labels, True)
233 | |||
234 | |||
def collect_device_health_self_assessment(device):
    """Collect metric about the device health self assessment.

    Args:
        device: (Device) Device in question.

    Yields:
        (Metric) Device health self assessment.
    """
    health_output = smart_ctl('--health', *device.smartctl_select())

    # PASSED (ATA) or OK (SCSI) in the verdict line means healthy.
    self_assessment_passed = self_test_re.search(health_output) is not None

    yield Metric(
        'device_smart_healthy', device.base_labels, self_assessment_passed)
253 | |||
254 | |||
def collect_ata_metrics(device):
    """Yield value/worst/threshold metrics for whitelisted SMART attributes.

    Args:
        device: (Device) Device in question.

    Yields:
        (Metric) One metric per table row and column (value, worst,
        threshold), labelled with the attribute name.
    """
    # Fetch SMART attributes for the given device.
    attributes = smart_ctl(
        '--attributes', *device.smartctl_select()
    )

    # replace multiple occurrences of whitespace with a single whitespace
    # so that the CSV Parser recognizes individual columns properly.
    attributes = re.sub(r'[\t\x20]+', ' ', attributes)

    # Turn smartctl output into a list of lines and skip to the table of
    # SMART attributes.
    attribute_lines = attributes.strip().split('\n')[7:]

    reader = csv.DictReader(
        (line.strip() for line in attribute_lines),
        fieldnames=SmartAttribute._fields[:-1],
        restkey=SmartAttribute._fields[-1], delimiter=' ')
    for entry in reader:
        # We're only interested in the SMART attributes that are
        # whitelisted here.
        entry['name'] = entry['name'].lower()
        if entry['name'] not in smart_attributes_whitelist:
            continue

        # Ensure that only the numeric parts are fetched from the raw_value.
        # Attributes such as 194 Temperature_Celsius reported by my SSD
        # are in the format of "36 (Min/Max 24/40)" which can't be expressed
        # properly as a prometheus metric.
        m = re.match(r'^(\d+)', ' '.join(entry['raw_value']))
        if not m:
            continue
        entry['raw_value'] = m.group(1)

        # The whitelist was already checked above, so emit unconditionally
        # (the previous redundant re-check has been removed).
        labels = {
            'name': entry['name'],
            **device.base_labels,
        }

        for col in 'value', 'worst', 'threshold':
            # The format call previously carried an unused name= argument.
            yield Metric('attr_{col}'.format(col=col), labels, entry[col])
299 | |||
300 | |||
def collect_ata_error_count(device):
    """Inspect the device error log and report the amount of entries.

    Args:
        device: (Device) Device in question.

    Yields:
        (Metric) Device error count.
    """
    # check=False: smartctl exits non-zero when errors are logged; we still
    # want to parse its output.
    error_log = smart_ctl(
        '-l', 'xerror,1', *device.smartctl_select(), check=False)

    m = ata_error_count_re.search(error_log)

    # Normalize to int: the regex group is a string while the fallback was
    # the int 0, which made the metric's value type inconsistent.
    error_count = int(m.group(1)) if m is not None else 0

    yield Metric('device_errors', device.base_labels, error_count)
318 | |||
319 | |||
def collect_disks_smart_metrics():
    """Collect all metrics for every SMART device found on this host.

    Yields:
        (Metric) Metrics for all discovered devices, cheapest-first so that
        sleeping or SMART-less devices can be skipped early.
    """
    # One shared timestamp for the whole collection run.
    now = int(datetime.datetime.utcnow().timestamp())

    for device in find_devices():
        yield Metric('smartctl_run', device.base_labels, now)

        is_active = device_is_active(device)

        yield Metric('device_active', device.base_labels, is_active)

        # Skip further metrics collection to prevent the disk from
        # spinning up.
        if not is_active:
            continue

        yield from collect_device_info(device)

        smart_available, smart_enabled = device_smart_capabilities(device)

        yield Metric(
            'device_smart_available', device.base_labels, smart_available)
        yield Metric(
            'device_smart_enabled', device.base_labels, smart_enabled)

        # Skip further metrics collection here if SMART is disabled
        # on the device. Further smartctl invocations would fail
        # anyways.
        if not smart_available:
            continue

        yield from collect_device_health_self_assessment(device)

        # Attribute tables (and the ATA error log) only apply to sat* types.
        if device.type.startswith('sat'):
            yield from collect_ata_metrics(device)

            yield from collect_ata_error_count(device)
356 | |||
357 | |||
def main():
    """Emit the smartctl version plus all device metrics for Prometheus."""
    version_metric = Metric(
        'smartctl_version', {'version': smart_ctl_version()}, True)
    metric_print_meta(version_metric, 'smartmon_')
    metric_print(version_metric, 'smartmon_')

    # Sort by metric name so each HELP/TYPE header is emitted exactly once,
    # right before the first sample of that name.
    previous = None
    for metric in sorted(collect_disks_smart_metrics(), key=lambda m: m.name):
        if metric.name != previous:
            metric_print_meta(metric, 'smartmon_')
        previous = metric.name

        metric_print(metric, 'smartmon_')

if __name__ == '__main__':
    main()
diff --git a/text_collector_examples/smartmon.sh b/text_collector_examples/smartmon.sh deleted file mode 100755 index 8a75d29..0000000 --- a/text_collector_examples/smartmon.sh +++ /dev/null | |||
@@ -1,194 +0,0 @@ | |||
1 | #!/bin/bash | ||
2 | # Script informed by the collectd monitoring script for smartmontools (using smartctl) | ||
3 | # by Samuel B. <samuel_._behan_(at)_dob_._sk> (c) 2012 | ||
4 | # source at: http://devel.dob.sk/collectd-scripts/ | ||
5 | |||
6 | # TODO: This probably needs to be a little more complex. The raw numbers can have more | ||
7 | # data in them than you'd think. | ||
8 | # http://arstechnica.com/civis/viewtopic.php?p=22062211 | ||
9 | |||
10 | # Formatting done via shfmt -i 2 | ||
11 | # https://github.com/mvdan/sh | ||
12 | |||
# awk program applied to each `smartctl -A` table row: rows whose first two
# columns look like an attribute id and name produce four samples
# (value, worst, threshold, raw_value), each labelled with the smart_id.
# The caller injects the per-disk label set via `awk -v labels=...`.
parse_smartctl_attributes_awk="$(
  cat <<'SMARTCTLAWK'
$1 ~ /^ *[0-9]+$/ && $2 ~ /^[a-zA-Z0-9_-]+$/ {
  gsub(/-/, "_");
  printf "%s_value{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $4
  printf "%s_worst{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $5
  printf "%s_threshold{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $6
  printf "%s_raw_value{%s,smart_id=\"%s\"} %e\n", $2, labels, $1, $10
}
SMARTCTLAWK
)"

# Whitelist of SMART attribute names (one per line) that are exported.
smartmon_attrs="$(
  cat <<'SMARTMONATTRS'
airflow_temperature_cel
command_timeout
current_pending_sector
end_to_end_error
erase_fail_count
g_sense_error_rate
hardware_ecc_recovered
host_reads_mib
host_reads_32mib
host_writes_mib
host_writes_32mib
load_cycle_count
media_wearout_indicator
wear_leveling_count
nand_writes_1gib
offline_uncorrectable
power_cycle_count
power_on_hours
program_fail_count
raw_read_error_rate
reallocated_event_count
reallocated_sector_ct
reported_uncorrect
sata_downshift_count
seek_error_rate
spin_retry_count
spin_up_time
start_stop_count
temperature_case
temperature_celsius
temperature_internal
total_lbas_read
total_lbas_written
udma_crc_error_count
unsafe_shutdown_count
workld_host_reads_perc
workld_media_wear_indic
workload_minutes
SMARTMONATTRS
)"
# Collapse the newline-separated list into a grep -E alternation: "a|b|c".
smartmon_attrs="$(echo ${smartmon_attrs} | xargs | tr ' ' '|')"
68 | |||
# Filter `smartctl -A` output (on stdin) into whitelisted attribute samples.
parse_smartctl_attributes() {
  local disk="$1"
  local disk_type="$2"
  local labels="disk=\"${disk}\",type=\"${disk_type}\""
  # The previous `local vars=...` recomputed the alternation already stored
  # in ${smartmon_attrs} and was never used; it has been removed.
  sed 's/^ \+//g' |
    awk -v labels="${labels}" "${parse_smartctl_attributes_awk}" 2>/dev/null |
    tr A-Z a-z |
    grep -E "(${smartmon_attrs})"
}
79 | |||
# Parse `smartctl -A` output for SCSI disks (prose "Key: value" lines rather
# than the ATA attribute table) from stdin into raw_value samples.
parse_smartctl_scsi_attributes() {
  local disk="$1"
  local disk_type="$2"
  local labels="disk=\"${disk}\",type=\"${disk_type}\""
  while read line; do
    # Normalize "=" separators to ":" and split into a key (spaces -> "_")
    # and a value with leading whitespace stripped.
    attr_type="$(echo "${line}" | tr '=' ':' | cut -f1 -d: | sed 's/^ \+//g' | tr ' ' '_')"
    attr_value="$(echo "${line}" | tr '=' ':' | cut -f2 -d: | sed 's/^ \+//g')"
    case "${attr_type}" in
      number_of_hours_powered_up_) power_on="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;;
      Current_Drive_Temperature) temp_cel="$(echo ${attr_value} | cut -f1 -d' ' | awk '{ printf "%e\n", $1 }')" ;;
      Blocks_sent_to_initiator_) lbas_read="$(echo ${attr_value} | awk '{ printf "%e\n", $1 }')" ;;
      Blocks_received_from_initiator_) lbas_written="$(echo ${attr_value} | awk '{ printf "%e\n", $1 }')" ;;
      Accumulated_start-stop_cycles) power_cycle="$(echo ${attr_value} | awk '{ printf "%e\n", $1 }')" ;;
      Elements_in_grown_defect_list) grown_defects="$(echo ${attr_value} | awk '{ printf "%e\n", $1 }')" ;;
    esac
  done
  # Emit samples only for values actually seen, reusing ATA-style metric
  # names and smart_id labels for dashboard compatibility.
  [ ! -z "$power_on" ] && echo "power_on_hours_raw_value{${labels},smart_id=\"9\"} ${power_on}"
  [ ! -z "$temp_cel" ] && echo "temperature_celsius_raw_value{${labels},smart_id=\"194\"} ${temp_cel}"
  [ ! -z "$lbas_read" ] && echo "total_lbas_read_raw_value{${labels},smart_id=\"242\"} ${lbas_read}"
  # NOTE(review): lbas_written reuses smart_id="242" (reads); writes are
  # conventionally id 241 — confirm before changing, dashboards may rely on it.
  [ ! -z "$lbas_written" ] && echo "total_lbas_written_raw_value{${labels},smart_id=\"242\"} ${lbas_written}"
  [ ! -z "$power_cycle" ] && echo "power_cycle_count_raw_value{${labels},smart_id=\"12\"} ${power_cycle}"
  # NOTE(review): grown_defects also uses smart_id="12", same as power
  # cycles — looks like a copy/paste slip; correct id unclear, verify.
  [ ! -z "$grown_defects" ] && echo "grown_defects_count_raw_value{${labels},smart_id=\"12\"} ${grown_defects}"
}
103 | |||
# Parse `smartctl -i -H` output (on stdin) into device_info, availability,
# enabled and health metrics for one disk.
parse_smartctl_info() {
  local -i smart_available=0 smart_enabled=0 smart_healthy=0
  local disk="$1" disk_type="$2"
  local model_family='' device_model='' serial_number='' fw_version='' vendor='' product='' revision='' lun_id=''
  while read line; do
    info_type="$(echo "${line}" | cut -f1 -d: | tr ' ' '_')"
    # Strip leading whitespace and escape embedded double quotes so the
    # value is safe inside a Prometheus label.
    info_value="$(echo "${line}" | cut -f2- -d: | sed 's/^ \+//g' | sed 's/"/\\"/')"
    case "${info_type}" in
      Model_Family) model_family="${info_value}" ;;
      Device_Model) device_model="${info_value}" ;;
      Serial_Number) serial_number="${info_value}" ;;
      Firmware_Version) fw_version="${info_value}" ;;
      Vendor) vendor="${info_value}" ;;
      Product) product="${info_value}" ;;
      Revision) revision="${info_value}" ;;
      Logical_Unit_id) lun_id="${info_value}" ;;
    esac
    if [[ "${info_type}" == 'SMART_support_is' ]]; then
      # Only the leading word matters: Enabled / Available / Unavailable.
      case "${info_value:0:7}" in
        Enabled) smart_enabled=1 ;;
        Availab) smart_available=1 ;;
        Unavail) smart_available=0 ;;
      esac
    fi
    # ATA reports "...test result: PASSED"; SCSI reports "Health Status: OK".
    if [[ "${info_type}" == 'SMART_overall-health_self-assessment_test_result' ]]; then
      case "${info_value:0:6}" in
        PASSED) smart_healthy=1 ;;
      esac
    elif [[ "${info_type}" == 'SMART_Health_Status' ]]; then
      case "${info_value:0:2}" in
        OK) smart_healthy=1 ;;
      esac
    fi
  done
  echo "device_info{disk=\"${disk}\",type=\"${disk_type}\",vendor=\"${vendor}\",product=\"${product}\",revision=\"${revision}\",lun_id=\"${lun_id}\",model_family=\"${model_family}\",device_model=\"${device_model}\",serial_number=\"${serial_number}\",firmware_version=\"${fw_version}\"} 1"
  echo "device_smart_available{disk=\"${disk}\",type=\"${disk_type}\"} ${smart_available}"
  echo "device_smart_enabled{disk=\"${disk}\",type=\"${disk_type}\"} ${smart_enabled}"
  echo "device_smart_healthy{disk=\"${disk}\",type=\"${disk_type}\"} ${smart_healthy}"
}
143 | |||
# awk program that prefixes each sample with "smartmon_" and emits HELP/TYPE
# headers whenever a new metric name appears (input must be sorted by name).
output_format_awk="$(
  cat <<'OUTPUTAWK'
BEGIN { v = "" }
v != $1 {
  print "# HELP smartmon_" $1 " SMART metric " $1;
  print "# TYPE smartmon_" $1 " gauge";
  v = $1
}
{print "smartmon_" $0}
OUTPUTAWK
)"

format_output() {
  # Sort so identical metric names are adjacent; split on '{' so awk's $1
  # is the bare metric name.
  sort |
    awk -F'{' "${output_format_awk}"
}
160 | |||
smartctl_version="$(/usr/sbin/smartctl -V | head -n1 | awk '$1 == "smartctl" {print $2}')"

echo "smartctl_version{version=\"${smartctl_version}\"} 1" | format_output

# --scan-open and the output parsed below need smartctl >= 6; bail quietly.
if [[ "$(expr "${smartctl_version}" : '\([0-9]*\)\..*')" -lt 6 ]]; then
  exit
fi

# One "device|type" entry per detected disk.
device_list="$(/usr/sbin/smartctl --scan-open | awk '/^\/dev/{print $1 "|" $3}')"

for device in ${device_list}; do
  disk="$(echo ${device} | cut -f1 -d'|')"
  type="$(echo ${device} | cut -f2 -d'|')"
  active=1
  echo "smartctl_run{disk=\"${disk}\",type=\"${type}\"}" "$(TZ=UTC date '+%s')"
  # Check if the device is in a low-power mode
  /usr/sbin/smartctl -n standby -d "${type}" "${disk}" > /dev/null || active=0
  echo "device_active{disk=\"${disk}\",type=\"${type}\"}" "${active}"
  # Skip further metrics to prevent the disk from spinning up
  test ${active} -eq 0 && continue
  # Get the SMART information and health
  /usr/sbin/smartctl -i -H -d "${type}" "${disk}" | parse_smartctl_info "${disk}" "${type}"
  # Get the SMART attributes; sat* types use the ATA table parser, scsi and
  # megaraid* use the prose-style SCSI parser.
  case ${type} in
    sat) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" ;;
    sat+megaraid*) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" ;;
    scsi) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" ;;
    megaraid*) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" ;;
    *)
      # Unknown device type: note it and stop; output so far is still piped on.
      echo "disk type is not sat, scsi or megaraid but ${type}"
      exit
      ;;
  esac
done | format_output
diff --git a/text_collector_examples/storcli.py b/text_collector_examples/storcli.py deleted file mode 100755 index 7dc6f95..0000000 --- a/text_collector_examples/storcli.py +++ /dev/null | |||
@@ -1,242 +0,0 @@ | |||
1 | #!/usr/bin/env python3 | ||
2 | """ | ||
3 | Script to parse StorCLI's JSON output and expose | ||
4 | MegaRAID health as Prometheus metrics. | ||
5 | |||
6 | Tested against StorCLI 'Ver 1.14.12 Nov 25, 2014'. | ||
7 | |||
8 | StorCLI reference manual: | ||
9 | http://docs.avagotech.com/docs/12352476 | ||
10 | |||
11 | Advanced Software Options (ASO) not exposed as metrics currently. | ||
12 | |||
13 | JSON key abbreviations used by StorCLI are documented in the standard command | ||
14 | output, i.e. when you omit the trailing 'J' from the command. | ||
15 | |||
16 | Formatting done with YAPF: | ||
17 | $ yapf -i --style '{COLUMN_LIMIT: 99}' storcli.py | ||
18 | """ | ||
19 | |||
20 | from __future__ import print_function | ||
21 | from datetime import datetime | ||
22 | import argparse | ||
23 | import collections | ||
24 | import json | ||
25 | import os | ||
26 | import shlex | ||
27 | import subprocess | ||
28 | |||
DESCRIPTION = """Parses StorCLI's JSON output and exposes MegaRAID health as
Prometheus metrics."""
VERSION = '0.0.3'

# Path to the StorCLI binary; overwritten in main() from --storcli_path.
storcli_path = ''
# Prefix prepended to every exported metric name.
metric_prefix = 'megaraid_'
# Maps metric name -> list of {'labels': str, 'value': float} samples.
# (The original also assigned a plain dict here that was immediately
# shadowed by this defaultdict; that dead assignment is removed.)
metric_list = collections.defaultdict(list)
37 | |||
38 | |||
def main(args):
    """Entry point: query StorCLI, collect metrics per controller, print them.

    Unexpected JSON layouts (missing keys) abort collection silently and
    whatever was gathered so far is still printed.
    """
    global storcli_path
    storcli_path = args.storcli_path
    data = get_storcli_json('/cALL show all J')

    try:
        # Everything of interest lives under the 'Controllers' key.
        for controller in data['Controllers']:
            response = controller['Response Data']
            handle_common_controller(response)
            driver = response['Version']['Driver Name']
            if driver == 'megaraid_sas':
                handle_megaraid_controller(response)
            elif driver == 'mpt3sas':
                handle_sas_controller(response)
    except KeyError:
        pass

    print_all_metrics(metric_list)
61 | |||
def handle_common_controller(response):
    """Emit metrics common to all controller types (the ROC temperature)."""
    _, baselabel = get_basic_controller_info(response)
    hw_cfg = response['HwCfg']

    # The misspelled key is assembled at runtime so CodeSpell does not flag
    # the misspelling ('Celcius') that some StorCLI versions emit.
    legacy_key = 'ROC temperature(Degree Celc' + 'ius)'
    fixed_key = 'ROC temperature(Degree Celsius)'
    if legacy_key in hw_cfg:
        hw_cfg[fixed_key] = hw_cfg.pop(legacy_key)
    add_metric('temperature', baselabel, int(hw_cfg[fixed_key]))
69 | |||
def handle_sas_controller(response):
    """Emit health, port and physical-drive metrics for an mpt3sas controller."""
    controller_index, baselabel = get_basic_controller_info(response)
    add_metric('healthy', baselabel,
               int(response['Status']['Controller Status'] == 'OK'))
    add_metric('ports', baselabel, response['HwCfg']['Backend Port Count'])
    try:
        # Every disk appears twice in this dict (basic info + detailed info),
        # so the physical-drive count is half the number of keys.
        add_metric('physical_drives', baselabel,
                   len(response['Physical Device Information'].keys()) / 2)
    except AttributeError:
        pass

    for key, basic_disk_info in response['Physical Device Information'].items():
        # Skip the 'Detailed Information' twins; they are looked up later.
        if 'Detailed Information' in key:
            continue
        create_metrics_of_physical_drive(basic_disk_info[0],
                                         response['Physical Device Information'],
                                         controller_index)
87 | |||
88 | |||
def handle_megaraid_controller(response):
    """Emit status, battery, clock-drift, VD and PD metrics for a
    megaraid_sas controller.

    Issues a second StorCLI invocation for detailed drive data when the
    controller reports at least one physical drive.
    """
    (controller_index, baselabel) = get_basic_controller_info(response)

    # BBU Status Optimal value is 0 for cachevault and 32 for BBU
    add_metric('battery_backup_healthy', baselabel,
               int(response['Status']['BBU Status'] in [0, 32]))
    add_metric('degraded', baselabel, int(response['Status']['Controller Status'] == 'Degraded'))
    add_metric('failed', baselabel, int(response['Status']['Controller Status'] == 'Failed'))
    add_metric('healthy', baselabel, int(response['Status']['Controller Status'] == 'Optimal'))
    add_metric('ports', baselabel, response['HwCfg']['Backend Port Count'])
    add_metric('scheduled_patrol_read', baselabel,
               int('hrs' in response['Scheduled Tasks']['Patrol Read Reoccurrence']))
    for cvidx, cvinfo in enumerate(response['Cachevault_Info']):
        # Temperature is reported as e.g. '23C'; strip the unit suffix.
        add_metric('cv_temperature', baselabel + ',cvidx="' + str(cvidx) + '"',
                   int(cvinfo['Temp'].replace('C', '')))

    time_difference_seconds = -1
    system_time = datetime.strptime(response['Basics'].get('Current System Date/time'),
                                    "%m/%d/%Y, %H:%M:%S")
    controller_time = datetime.strptime(response['Basics'].get('Current Controller Date/Time'),
                                        "%m/%d/%Y, %H:%M:%S")
    if system_time and controller_time:
        # Bug fix: timedelta.seconds discards whole days, so a drift of one
        # day or more was under-reported; total_seconds() counts the full
        # delta.
        time_difference_seconds = int(abs(system_time - controller_time).total_seconds())
    add_metric('time_difference', baselabel, time_difference_seconds)

    # Make sure it doesn't crash if it's a JBOD setup
    if 'Drive Groups' in response.keys():
        add_metric('drive_groups', baselabel, response['Drive Groups'])
        add_metric('virtual_drives', baselabel, response['Virtual Drives'])

        for virtual_drive in response['VD LIST']:
            vd_position = virtual_drive.get('DG/VD')
            drive_group, volume_group = -1, -1
            if vd_position:
                drive_group = vd_position.split('/')[0]
                volume_group = vd_position.split('/')[1]
            vd_baselabel = 'controller="{0}",DG="{1}",VG="{2}"'.format(
                controller_index, drive_group, volume_group)
            vd_info_label = vd_baselabel + ',name="{0}",cache="{1}",type="{2}",state="{3}"'.format(
                str(virtual_drive.get('Name')).strip(),
                str(virtual_drive.get('Cache')).strip(),
                str(virtual_drive.get('TYPE')).strip(),
                str(virtual_drive.get('State')).strip())
            add_metric('vd_info', vd_info_label, 1)

    add_metric('physical_drives', baselabel, response['Physical Drives'])
    if response['Physical Drives'] > 0:
        # Detailed per-drive data requires a second StorCLI invocation.
        data = get_storcli_json('/cALL/eALL/sALL show all J')
        drive_info = data['Controllers'][controller_index]['Response Data']
        for physical_drive in response['PD LIST']:
            create_metrics_of_physical_drive(physical_drive, drive_info, controller_index)
139 | |||
140 | |||
def get_basic_controller_info(response):
    """Emit the controller_info metric and return (controller index, base label)."""
    index = response['Basics']['Controller']
    baselabel = 'controller="{0}"'.format(index)

    # Static identity of the controller goes into an info-style metric.
    info_label = baselabel + ',model="{0}",serial="{1}",fwversion="{2}"'.format(
        str(response['Basics']['Model']).strip(),
        str(response['Basics']['Serial Number']).strip(),
        str(response['Version']['Firmware Version']).strip(),
    )
    add_metric('controller_info', info_label, 1)

    return (index, baselabel)
153 | |||
154 | |||
def create_metrics_of_physical_drive(physical_drive, detailed_info_array, controller_index):
    """Emit metrics for one physical drive.

    physical_drive: the basic-info dict for the drive (keys like 'EID:Slt',
        'DID', 'Intf' as produced by StorCLI).
    detailed_info_array: the dict holding the drive's
        '<identifier> - Detailed Information' entries.
    controller_index: numeric controller index used in labels and the
        StorCLI drive identifier.
    """
    # 'EID:Slt' is 'enclosure:slot'; enclosure may be ' ' for direct-attached.
    enclosure = physical_drive.get('EID:Slt').split(':')[0]
    slot = physical_drive.get('EID:Slt').split(':')[1]

    pd_baselabel = 'controller="{0}",enclosure="{1}",slot="{2}"'.format(controller_index, enclosure,
                                                                       slot)
    pd_info_label = pd_baselabel + \
        ',disk_id="{0}",interface="{1}",media="{2}",model="{3}",DG="{4}",state="{5}"'.format(
            str(physical_drive.get('DID')).strip(),
            str(physical_drive.get('Intf')).strip(),
            str(physical_drive.get('Med')).strip(),
            str(physical_drive.get('Model')).strip(),
            str(physical_drive.get('DG')).strip(),
            str(physical_drive.get('State')).strip())

    # StorCLI names detailed sections 'Drive /cX/eY/sZ'; drives without an
    # enclosure (EID is a space) use the '/cX/sZ' form instead.
    drive_identifier = 'Drive /c' + str(controller_index) + '/e' + str(enclosure) + '/s' + str(
        slot)
    if enclosure == ' ':
        drive_identifier = 'Drive /c' + str(controller_index) + '/s' + str(slot)
    try:
        info = detailed_info_array[drive_identifier + ' - Detailed Information']
        state = info[drive_identifier + ' State']
        attributes = info[drive_identifier + ' Device attributes']
        settings = info[drive_identifier + ' Policies/Settings']

        add_metric('pd_shield_counter', pd_baselabel, state['Shield Counter'])
        add_metric('pd_media_errors', pd_baselabel, state['Media Error Count'])
        add_metric('pd_other_errors', pd_baselabel, state['Other Error Count'])
        add_metric('pd_predictive_errors', pd_baselabel, state['Predictive Failure Count'])
        add_metric('pd_smart_alerted', pd_baselabel,
                   int(state['S.M.A.R.T alert flagged by drive'] == 'Yes'))
        # Speeds look like '6.0Gb/s'; keep only the integer part before '.'.
        add_metric('pd_link_speed_gbps', pd_baselabel, attributes['Link Speed'].split('.')[0])
        add_metric('pd_device_speed_gbps', pd_baselabel, attributes['Device Speed'].split('.')[0])
        add_metric('pd_commissioned_spare', pd_baselabel,
                   int(settings['Commissioned Spare'] == 'Yes'))
        add_metric('pd_emergency_spare', pd_baselabel, int(settings['Emergency Spare'] == 'Yes'))
        pd_info_label += ',firmware="{0}"'.format(attributes['Firmware Revision'].strip())
    except KeyError:
        # Detailed info is optional; fall through and emit pd_info regardless.
        pass
    add_metric('pd_info', pd_info_label, 1)
195 | |||
196 | |||
def add_metric(name, labels, value):
    """Record one sample for metric *name*; non-numeric values are dropped."""
    global metric_list
    try:
        numeric = float(value)
    except ValueError:
        # StorCLI reports strings like 'Unknown' for some fields; skip them.
        return
    metric_list[name].append({'labels': labels, 'value': numeric})
206 | |||
207 | |||
def print_all_metrics(metrics):
    """Print HELP/TYPE headers and all samples in Prometheus text format."""
    for name, samples in metrics.items():
        print('# HELP {0}{1} MegaRAID {2}'.format(metric_prefix, name, name.replace('_', ' ')))
        print('# TYPE {0}{1} gauge'.format(metric_prefix, name))
        for sample in samples:
            # add_metric() stores only floats, so this defensive guard can
            # never be false; kept to mirror the original behavior exactly.
            if sample['value'] != 'Unknown':
                print('{0}{1}{2} {3}'.format(metric_prefix, name,
                                             '{' + sample['labels'] + '}',
                                             sample['value']))
216 | |||
217 | |||
def get_storcli_json(storcli_args):
    """Run StorCLI with *storcli_args* and return its parsed JSON output.

    Exits with status 1 when the binary is missing/not executable or when
    the controller reports a non-'Success' command status.
    """
    # Check if storcli is installed and executable
    if not (os.path.isfile(storcli_path) and os.access(storcli_path, os.X_OK)):
        # Bug fix: the original merely constructed SystemExit(1) without
        # raising it, so a missing binary fell through to a confusing
        # Popen failure instead of a clean exit.
        raise SystemExit(1)
    storcli_cmd = shlex.split(storcli_path + ' ' + storcli_args)
    proc = subprocess.Popen(
        storcli_cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    output_json = proc.communicate()[0]
    data = json.loads(output_json.decode("utf-8"))

    if data["Controllers"][0]["Command Status"]["Status"] != "Success":
        # Same fix: actually raise instead of discarding the exception object.
        raise SystemExit(1)
    return data
232 | |||
233 | |||
if __name__ == "__main__":
    # Parse command-line flags and hand off to main().
    arg_parser = argparse.ArgumentParser(
        description=DESCRIPTION,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
        '--storcli_path',
        default='/opt/MegaRAID/storcli/storcli64',
        help='path to StorCLi binary')
    arg_parser.add_argument(
        '--version', action='version', version='%(prog)s {0}'.format(VERSION))
    main(arg_parser.parse_args())
diff --git a/text_collector_examples/yum.sh b/text_collector_examples/yum.sh deleted file mode 100755 index d0034ee..0000000 --- a/text_collector_examples/yum.sh +++ /dev/null | |||
@@ -1,18 +0,0 @@ | |||
#!/bin/bash
#
# Description: Expose metrics from yum updates.
#
# Author: Slawomir Gonet <slawek@otwiera.cz>
#
# Based on apt.sh by Ben Kochie <superq@gmail.com>

# Bug fix: the original ran 'yum -q check-updates', but yum's subcommand is
# 'check-update' (singular); the plural form fails with 'No such command'
# and silently produced an empty metric. The deprecated 'egrep' is replaced
# by the equivalent 'grep -E'. The awk stage drops everything from the
# 'Obsoleting Packages' section onward; the final pipeline counts pending
# updates per origin repository.
upgrades=$(/usr/bin/yum -q check-update | awk 'BEGIN { mute=1 } /Obsoleting Packages/ { mute=0 } mute { print }' | grep -E '^\w+\.\w+' | awk '{print $3}' | sort | uniq -c | awk '{print "yum_upgrades_pending{origin=\""$2"\"} "$1}')

echo '# HELP yum_upgrades_pending Yum package pending updates by origin.'
echo '# TYPE yum_upgrades_pending gauge'
if [[ -n "${upgrades}" ]] ; then
  echo "${upgrades}"
else
  # No pending updates: still emit the series so the metric always exists.
  echo 'yum_upgrades_pending{origin=""} 0'
fi