diff options
author | Daniele Sluijters <daenney@users.noreply.github.com> | 2019-04-18 11:19:20 +0100 |
---|---|---|
committer | Ben Kochie <superq@gmail.com> | 2019-04-18 12:19:20 +0200 |
commit | cc2fd82008d51b0f40abe69f8993011a533de2f3 (patch) | |
tree | aef7bcd9b5e3f150092650c147b57ffb35dc6e15 | |
parent | 4e5c4d464fa67e9cdfd9858d2151bc99603b2bff (diff) | |
download | prometheus_node_collector-cc2fd82008d51b0f40abe69f8993011a533de2f3.tar.bz2 prometheus_node_collector-cc2fd82008d51b0f40abe69f8993011a533de2f3.tar.xz prometheus_node_collector-cc2fd82008d51b0f40abe69f8993011a533de2f3.zip |
Expose /proc/pressure (#1261)
This enables the collection of pressure stall information as exposed
by the `/proc/pressure` interface added in the 4.20 release of the
Linux kernel.
Closes #1174
Signed-off-by: Daniele Sluijters <daenney@users.noreply.github.com>
-rw-r--r-- | CHANGELOG.md | 1 | ||||
-rw-r--r-- | README.md | 1 | ||||
-rw-r--r-- | collector/fixtures/e2e-64k-page-output.txt | 16 | ||||
-rw-r--r-- | collector/fixtures/e2e-output.txt | 16 | ||||
-rw-r--r-- | collector/fixtures/proc/pressure/cpu | 1 | ||||
-rw-r--r-- | collector/fixtures/proc/pressure/io | 2 | ||||
-rw-r--r-- | collector/fixtures/proc/pressure/memory | 2 | ||||
-rw-r--r-- | collector/pressure_linux.go | 105 | ||||
-rwxr-xr-x | end-to-end-test.sh | 1 |
9 files changed, 145 insertions, 0 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index 154b4b7..d86dbcf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md | |||
@@ -29,6 +29,7 @@ | |||
29 | * [FEATURE] Add uname collector for FreeBSD #1239 | 29 | * [FEATURE] Add uname collector for FreeBSD #1239 |
30 | * [FEATURE] Add diskstats collector for OpenBSD #1250 | 30 | * [FEATURE] Add diskstats collector for OpenBSD #1250 |
31 | * [CHANGE] Bonding state uses mii_status #1124 | 31 | * [CHANGE] Bonding state uses mii_status #1124 |
32 | * [FEATURE] Add pressure collector exposing pressure stall information for Linux #1174 | ||
32 | 33 | ||
33 | ## 0.17.0 / 2018-11-30 | 34 | ## 0.17.0 / 2018-11-30 |
34 | 35 | ||
@@ -73,6 +73,7 @@ logind | Exposes session counts from [logind](http://www.freedesktop.org/wiki/So | |||
73 | meminfo\_numa | Exposes memory statistics from `/proc/meminfo_numa`. | Linux | 73 | meminfo\_numa | Exposes memory statistics from `/proc/meminfo_numa`. | Linux |
74 | mountstats | Exposes filesystem statistics from `/proc/self/mountstats`. Exposes detailed NFS client statistics. | Linux | 74 | mountstats | Exposes filesystem statistics from `/proc/self/mountstats`. Exposes detailed NFS client statistics. | Linux |
75 | ntp | Exposes local NTP daemon health to check [time](./docs/TIME.md) | _any_ | 75 | ntp | Exposes local NTP daemon health to check [time](./docs/TIME.md) | _any_ |
76 | pressure | Exposes pressure stall statistics from `/proc/pressure/`. | Linux (kernel 4.20+ and/or [CONFIG\_PSI](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/accounting/psi.txt)) | ||
76 | processes | Exposes aggregate process statistics from `/proc`. | Linux | 77 | processes | Exposes aggregate process statistics from `/proc`. | Linux |
77 | qdisc | Exposes [queuing discipline](https://en.wikipedia.org/wiki/Network_scheduler#Linux_kernel) statistics | Linux | 78 | qdisc | Exposes [queuing discipline](https://en.wikipedia.org/wiki/Network_scheduler#Linux_kernel) statistics | Linux |
78 | runit | Exposes service status from [runit](http://smarden.org/runit/). | _any_ | 79 | runit | Exposes service status from [runit](http://smarden.org/runit/). | _any_ |
diff --git a/collector/fixtures/e2e-64k-page-output.txt b/collector/fixtures/e2e-64k-page-output.txt index ea59731..47e8f9a 100644 --- a/collector/fixtures/e2e-64k-page-output.txt +++ b/collector/fixtures/e2e-64k-page-output.txt | |||
@@ -2289,6 +2289,21 @@ node_nfsd_server_rpcs_total 18628 | |||
2289 | # HELP node_nfsd_server_threads Total number of NFSd kernel threads that are running. | 2289 | # HELP node_nfsd_server_threads Total number of NFSd kernel threads that are running. |
2290 | # TYPE node_nfsd_server_threads gauge | 2290 | # TYPE node_nfsd_server_threads gauge |
2291 | node_nfsd_server_threads 8 | 2291 | node_nfsd_server_threads 8 |
2292 | # HELP node_pressure_cpu_waiting_seconds_total Total time in seconds that processes have waited for CPU time | ||
2293 | # TYPE node_pressure_cpu_waiting_seconds_total counter | ||
2294 | node_pressure_cpu_waiting_seconds_total 14.036781000000001 | ||
2295 | # HELP node_pressure_io_stalled_seconds_total Total time in seconds no process could make progress due to IO congestion | ||
2296 | # TYPE node_pressure_io_stalled_seconds_total counter | ||
2297 | node_pressure_io_stalled_seconds_total 159.229614 | ||
2298 | # HELP node_pressure_io_waiting_seconds_total Total time in seconds that processes have waited due to IO congestion | ||
2299 | # TYPE node_pressure_io_waiting_seconds_total counter | ||
2300 | node_pressure_io_waiting_seconds_total 159.886802 | ||
2301 | # HELP node_pressure_memory_stalled_seconds_total Total time in seconds no process could make progress due to memory congestion | ||
2302 | # TYPE node_pressure_memory_stalled_seconds_total counter | ||
2303 | node_pressure_memory_stalled_seconds_total 0 | ||
2304 | # HELP node_pressure_memory_waiting_seconds_total Total time in seconds that processes have waited for memory | ||
2305 | # TYPE node_pressure_memory_waiting_seconds_total counter | ||
2306 | node_pressure_memory_waiting_seconds_total 0 | ||
2292 | # HELP node_processes_max_processes Number of max PIDs limit | 2307 | # HELP node_processes_max_processes Number of max PIDs limit |
2293 | # TYPE node_processes_max_processes gauge | 2308 | # TYPE node_processes_max_processes gauge |
2294 | node_processes_max_processes 123 | 2309 | node_processes_max_processes 123 |
@@ -2361,6 +2376,7 @@ node_scrape_collector_success{collector="netdev"} 1 | |||
2361 | node_scrape_collector_success{collector="netstat"} 1 | 2376 | node_scrape_collector_success{collector="netstat"} 1 |
2362 | node_scrape_collector_success{collector="nfs"} 1 | 2377 | node_scrape_collector_success{collector="nfs"} 1 |
2363 | node_scrape_collector_success{collector="nfsd"} 1 | 2378 | node_scrape_collector_success{collector="nfsd"} 1 |
2379 | node_scrape_collector_success{collector="pressure"} 1 | ||
2364 | node_scrape_collector_success{collector="processes"} 1 | 2380 | node_scrape_collector_success{collector="processes"} 1 |
2365 | node_scrape_collector_success{collector="qdisc"} 1 | 2381 | node_scrape_collector_success{collector="qdisc"} 1 |
2366 | node_scrape_collector_success{collector="sockstat"} 1 | 2382 | node_scrape_collector_success{collector="sockstat"} 1 |
diff --git a/collector/fixtures/e2e-output.txt b/collector/fixtures/e2e-output.txt index f4c6d56..147113e 100644 --- a/collector/fixtures/e2e-output.txt +++ b/collector/fixtures/e2e-output.txt | |||
@@ -2289,6 +2289,21 @@ node_nfsd_server_rpcs_total 18628 | |||
2289 | # HELP node_nfsd_server_threads Total number of NFSd kernel threads that are running. | 2289 | # HELP node_nfsd_server_threads Total number of NFSd kernel threads that are running. |
2290 | # TYPE node_nfsd_server_threads gauge | 2290 | # TYPE node_nfsd_server_threads gauge |
2291 | node_nfsd_server_threads 8 | 2291 | node_nfsd_server_threads 8 |
2292 | # HELP node_pressure_cpu_waiting_seconds_total Total time in seconds that processes have waited for CPU time | ||
2293 | # TYPE node_pressure_cpu_waiting_seconds_total counter | ||
2294 | node_pressure_cpu_waiting_seconds_total 14.036781000000001 | ||
2295 | # HELP node_pressure_io_stalled_seconds_total Total time in seconds no process could make progress due to IO congestion | ||
2296 | # TYPE node_pressure_io_stalled_seconds_total counter | ||
2297 | node_pressure_io_stalled_seconds_total 159.229614 | ||
2298 | # HELP node_pressure_io_waiting_seconds_total Total time in seconds that processes have waited due to IO congestion | ||
2299 | # TYPE node_pressure_io_waiting_seconds_total counter | ||
2300 | node_pressure_io_waiting_seconds_total 159.886802 | ||
2301 | # HELP node_pressure_memory_stalled_seconds_total Total time in seconds no process could make progress due to memory congestion | ||
2302 | # TYPE node_pressure_memory_stalled_seconds_total counter | ||
2303 | node_pressure_memory_stalled_seconds_total 0 | ||
2304 | # HELP node_pressure_memory_waiting_seconds_total Total time in seconds that processes have waited for memory | ||
2305 | # TYPE node_pressure_memory_waiting_seconds_total counter | ||
2306 | node_pressure_memory_waiting_seconds_total 0 | ||
2292 | # HELP node_processes_max_processes Number of max PIDs limit | 2307 | # HELP node_processes_max_processes Number of max PIDs limit |
2293 | # TYPE node_processes_max_processes gauge | 2308 | # TYPE node_processes_max_processes gauge |
2294 | node_processes_max_processes 123 | 2309 | node_processes_max_processes 123 |
@@ -2361,6 +2376,7 @@ node_scrape_collector_success{collector="netdev"} 1 | |||
2361 | node_scrape_collector_success{collector="netstat"} 1 | 2376 | node_scrape_collector_success{collector="netstat"} 1 |
2362 | node_scrape_collector_success{collector="nfs"} 1 | 2377 | node_scrape_collector_success{collector="nfs"} 1 |
2363 | node_scrape_collector_success{collector="nfsd"} 1 | 2378 | node_scrape_collector_success{collector="nfsd"} 1 |
2379 | node_scrape_collector_success{collector="pressure"} 1 | ||
2364 | node_scrape_collector_success{collector="processes"} 1 | 2380 | node_scrape_collector_success{collector="processes"} 1 |
2365 | node_scrape_collector_success{collector="qdisc"} 1 | 2381 | node_scrape_collector_success{collector="qdisc"} 1 |
2366 | node_scrape_collector_success{collector="sockstat"} 1 | 2382 | node_scrape_collector_success{collector="sockstat"} 1 |
diff --git a/collector/fixtures/proc/pressure/cpu b/collector/fixtures/proc/pressure/cpu new file mode 100644 index 0000000..14acc3a --- /dev/null +++ b/collector/fixtures/proc/pressure/cpu | |||
@@ -0,0 +1 @@ | |||
some avg10=0.00 avg60=0.00 avg300=0.00 total=14036781 | |||
diff --git a/collector/fixtures/proc/pressure/io b/collector/fixtures/proc/pressure/io new file mode 100644 index 0000000..4cdc413 --- /dev/null +++ b/collector/fixtures/proc/pressure/io | |||
@@ -0,0 +1,2 @@ | |||
1 | some avg10=0.18 avg60=0.34 avg300=0.10 total=159886802 | ||
2 | full avg10=0.18 avg60=0.34 avg300=0.10 total=159229614 | ||
diff --git a/collector/fixtures/proc/pressure/memory b/collector/fixtures/proc/pressure/memory new file mode 100644 index 0000000..30c03cc --- /dev/null +++ b/collector/fixtures/proc/pressure/memory | |||
@@ -0,0 +1,2 @@ | |||
1 | some avg10=0.00 avg60=0.00 avg300=0.00 total=0 | ||
2 | full avg10=0.00 avg60=0.00 avg300=0.00 total=0 | ||
diff --git a/collector/pressure_linux.go b/collector/pressure_linux.go new file mode 100644 index 0000000..90b20f8 --- /dev/null +++ b/collector/pressure_linux.go | |||
@@ -0,0 +1,105 @@ | |||
1 | // Copyright 2019 The Prometheus Authors | ||
2 | // Licensed under the Apache License, Version 2.0 (the "License"); | ||
3 | // you may not use this file except in compliance with the License. | ||
4 | // You may obtain a copy of the License at | ||
5 | // | ||
6 | // http://www.apache.org/licenses/LICENSE-2.0 | ||
7 | // | ||
8 | // Unless required by applicable law or agreed to in writing, software | ||
9 | // distributed under the License is distributed on an "AS IS" BASIS, | ||
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
11 | // See the License for the specific language governing permissions and | ||
12 | // limitations under the License. | ||
13 | |||
14 | // +build !nopressure | ||
15 | |||
16 | package collector | ||
17 | |||
18 | import ( | ||
19 | "fmt" | ||
20 | |||
21 | "github.com/prometheus/client_golang/prometheus" | ||
22 | "github.com/prometheus/common/log" | ||
23 | "github.com/prometheus/procfs" | ||
24 | ) | ||
25 | |||
// psiResources lists the resource names whose pressure stall information is
// collected; each entry is passed to procfs.FS.NewPSIStatsForResource.
var psiResources = []string{"cpu", "io", "memory"}
29 | |||
30 | type pressureStatsCollector struct { | ||
31 | cpu *prometheus.Desc | ||
32 | io *prometheus.Desc | ||
33 | ioFull *prometheus.Desc | ||
34 | mem *prometheus.Desc | ||
35 | memFull *prometheus.Desc | ||
36 | |||
37 | fs procfs.FS | ||
38 | } | ||
39 | |||
40 | func init() { | ||
41 | registerCollector("pressure", defaultEnabled, NewPressureStatsCollector) | ||
42 | } | ||
43 | |||
44 | // NewPressureStatsCollector returns a Collector exposing pressure stall information | ||
45 | func NewPressureStatsCollector() (Collector, error) { | ||
46 | fs, err := procfs.NewFS(*procPath) | ||
47 | if err != nil { | ||
48 | return nil, fmt.Errorf("failed to open procfs: %v", err) | ||
49 | } | ||
50 | |||
51 | return &pressureStatsCollector{ | ||
52 | cpu: prometheus.NewDesc( | ||
53 | prometheus.BuildFQName(namespace, "pressure", "cpu_waiting_seconds_total"), | ||
54 | "Total time in seconds that processes have waited for CPU time", | ||
55 | nil, nil, | ||
56 | ), | ||
57 | io: prometheus.NewDesc( | ||
58 | prometheus.BuildFQName(namespace, "pressure", "io_waiting_seconds_total"), | ||
59 | "Total time in seconds that processes have waited due to IO congestion", | ||
60 | nil, nil, | ||
61 | ), | ||
62 | ioFull: prometheus.NewDesc( | ||
63 | prometheus.BuildFQName(namespace, "pressure", "io_stalled_seconds_total"), | ||
64 | "Total time in seconds no process could make progress due to IO congestion", | ||
65 | nil, nil, | ||
66 | ), | ||
67 | mem: prometheus.NewDesc( | ||
68 | prometheus.BuildFQName(namespace, "pressure", "memory_waiting_seconds_total"), | ||
69 | "Total time in seconds that processes have waited for memory", | ||
70 | nil, nil, | ||
71 | ), | ||
72 | memFull: prometheus.NewDesc( | ||
73 | prometheus.BuildFQName(namespace, "pressure", "memory_stalled_seconds_total"), | ||
74 | "Total time in seconds no process could make progress due to memory congestion", | ||
75 | nil, nil, | ||
76 | ), | ||
77 | fs: fs, | ||
78 | }, nil | ||
79 | } | ||
80 | |||
81 | // Update calls procfs.NewPSIStatsForResource for the different resources and updates the values | ||
82 | func (c *pressureStatsCollector) Update(ch chan<- prometheus.Metric) error { | ||
83 | for _, res := range psiResources { | ||
84 | log.Debugf("collecting statistics for resource: %s", res) | ||
85 | vals, err := c.fs.NewPSIStatsForResource(res) | ||
86 | if err != nil { | ||
87 | log.Debug("pressure information is unavailable, you need a Linux kernel >= 4.20 and/or CONFIG_PSI enabled for your kernel") | ||
88 | return nil | ||
89 | } | ||
90 | switch res { | ||
91 | case "cpu": | ||
92 | ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, float64(vals.Some.Total)/1000.0/1000.0) | ||
93 | case "io": | ||
94 | ch <- prometheus.MustNewConstMetric(c.io, prometheus.CounterValue, float64(vals.Some.Total)/1000.0/1000.0) | ||
95 | ch <- prometheus.MustNewConstMetric(c.ioFull, prometheus.CounterValue, float64(vals.Full.Total)/1000.0/1000.0) | ||
96 | case "memory": | ||
97 | ch <- prometheus.MustNewConstMetric(c.mem, prometheus.CounterValue, float64(vals.Some.Total)/1000.0/1000.0) | ||
98 | ch <- prometheus.MustNewConstMetric(c.memFull, prometheus.CounterValue, float64(vals.Full.Total)/1000.0/1000.0) | ||
99 | default: | ||
100 | log.Debugf("did not account for resource: %s", res) | ||
101 | } | ||
102 | } | ||
103 | |||
104 | return nil | ||
105 | } | ||
diff --git a/end-to-end-test.sh b/end-to-end-test.sh index ea24cf5..6d3c9f4 100755 --- a/end-to-end-test.sh +++ b/end-to-end-test.sh | |||
@@ -28,6 +28,7 @@ enabled_collectors=$(cat << COLLECTORS | |||
28 | netstat | 28 | netstat |
29 | nfs | 29 | nfs |
30 | nfsd | 30 | nfsd |
31 | pressure | ||
31 | qdisc | 32 | qdisc |
32 | sockstat | 33 | sockstat |
33 | stat | 34 | stat |