aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDaniele Sluijters <daenney@users.noreply.github.com>2019-04-18 11:19:20 +0100
committerBen Kochie <superq@gmail.com>2019-04-18 12:19:20 +0200
commitcc2fd82008d51b0f40abe69f8993011a533de2f3 (patch)
treeaef7bcd9b5e3f150092650c147b57ffb35dc6e15
parent4e5c4d464fa67e9cdfd9858d2151bc99603b2bff (diff)
downloadprometheus_node_collector-cc2fd82008d51b0f40abe69f8993011a533de2f3.tar.bz2
prometheus_node_collector-cc2fd82008d51b0f40abe69f8993011a533de2f3.tar.xz
prometheus_node_collector-cc2fd82008d51b0f40abe69f8993011a533de2f3.zip
Expose /proc/pressure (#1261)
This enables the collection of pressure stall information as exposed by the `/proc/pressure` interface added in the 4.20 release of the Linux kernel. Closes #1174 Signed-off-by: Daniele Sluijters <daenney@users.noreply.github.com>
-rw-r--r--CHANGELOG.md1
-rw-r--r--README.md1
-rw-r--r--collector/fixtures/e2e-64k-page-output.txt16
-rw-r--r--collector/fixtures/e2e-output.txt16
-rw-r--r--collector/fixtures/proc/pressure/cpu1
-rw-r--r--collector/fixtures/proc/pressure/io2
-rw-r--r--collector/fixtures/proc/pressure/memory2
-rw-r--r--collector/pressure_linux.go105
-rwxr-xr-xend-to-end-test.sh1
9 files changed, 145 insertions, 0 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 154b4b7..d86dbcf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -29,6 +29,7 @@
29* [FEATURE] Add uname collector for FreeBSD #1239 29* [FEATURE] Add uname collector for FreeBSD #1239
30* [FEATURE] Add diskstats collector for OpenBSD #1250 30* [FEATURE] Add diskstats collector for OpenBSD #1250
31* [CHANGE] Bonding state uses mii_status #1124 31* [CHANGE] Bonding state uses mii_status #1124
32* [FEATURE] Add pressure collector exposing pressure stall information for Linux #1174
32 33
33## 0.17.0 / 2018-11-30 34## 0.17.0 / 2018-11-30
34 35
diff --git a/README.md b/README.md
index 65898d0..4d97efc 100644
--- a/README.md
+++ b/README.md
@@ -73,6 +73,7 @@ logind | Exposes session counts from [logind](http://www.freedesktop.org/wiki/So
73meminfo\_numa | Exposes memory statistics from `/proc/meminfo_numa`. | Linux 73meminfo\_numa | Exposes memory statistics from `/proc/meminfo_numa`. | Linux
74mountstats | Exposes filesystem statistics from `/proc/self/mountstats`. Exposes detailed NFS client statistics. | Linux 74mountstats | Exposes filesystem statistics from `/proc/self/mountstats`. Exposes detailed NFS client statistics. | Linux
75ntp | Exposes local NTP daemon health to check [time](./docs/TIME.md) | _any_ 75ntp | Exposes local NTP daemon health to check [time](./docs/TIME.md) | _any_
76pressure | Exposes pressure stall statistics from `/proc/pressure/`. | Linux (kernel 4.20+ and/or [CONFIG\_PSI](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/accounting/psi.txt))
76processes | Exposes aggregate process statistics from `/proc`. | Linux 77processes | Exposes aggregate process statistics from `/proc`. | Linux
77qdisc | Exposes [queuing discipline](https://en.wikipedia.org/wiki/Network_scheduler#Linux_kernel) statistics | Linux 78qdisc | Exposes [queuing discipline](https://en.wikipedia.org/wiki/Network_scheduler#Linux_kernel) statistics | Linux
78runit | Exposes service status from [runit](http://smarden.org/runit/). | _any_ 79runit | Exposes service status from [runit](http://smarden.org/runit/). | _any_
diff --git a/collector/fixtures/e2e-64k-page-output.txt b/collector/fixtures/e2e-64k-page-output.txt
index ea59731..47e8f9a 100644
--- a/collector/fixtures/e2e-64k-page-output.txt
+++ b/collector/fixtures/e2e-64k-page-output.txt
@@ -2289,6 +2289,21 @@ node_nfsd_server_rpcs_total 18628
2289# HELP node_nfsd_server_threads Total number of NFSd kernel threads that are running. 2289# HELP node_nfsd_server_threads Total number of NFSd kernel threads that are running.
2290# TYPE node_nfsd_server_threads gauge 2290# TYPE node_nfsd_server_threads gauge
2291node_nfsd_server_threads 8 2291node_nfsd_server_threads 8
2292# HELP node_pressure_cpu_waiting_seconds_total Total time in seconds that processes have waited for CPU time
2293# TYPE node_pressure_cpu_waiting_seconds_total counter
2294node_pressure_cpu_waiting_seconds_total 14.036781000000001
2295# HELP node_pressure_io_stalled_seconds_total Total time in seconds no process could make progress due to IO congestion
2296# TYPE node_pressure_io_stalled_seconds_total counter
2297node_pressure_io_stalled_seconds_total 159.229614
2298# HELP node_pressure_io_waiting_seconds_total Total time in seconds that processes have waited due to IO congestion
2299# TYPE node_pressure_io_waiting_seconds_total counter
2300node_pressure_io_waiting_seconds_total 159.886802
2301# HELP node_pressure_memory_stalled_seconds_total Total time in seconds no process could make progress due to memory congestion
2302# TYPE node_pressure_memory_stalled_seconds_total counter
2303node_pressure_memory_stalled_seconds_total 0
2304# HELP node_pressure_memory_waiting_seconds_total Total time in seconds that processes have waited for memory
2305# TYPE node_pressure_memory_waiting_seconds_total counter
2306node_pressure_memory_waiting_seconds_total 0
2292# HELP node_processes_max_processes Number of max PIDs limit 2307# HELP node_processes_max_processes Number of max PIDs limit
2293# TYPE node_processes_max_processes gauge 2308# TYPE node_processes_max_processes gauge
2294node_processes_max_processes 123 2309node_processes_max_processes 123
@@ -2361,6 +2376,7 @@ node_scrape_collector_success{collector="netdev"} 1
2361node_scrape_collector_success{collector="netstat"} 1 2376node_scrape_collector_success{collector="netstat"} 1
2362node_scrape_collector_success{collector="nfs"} 1 2377node_scrape_collector_success{collector="nfs"} 1
2363node_scrape_collector_success{collector="nfsd"} 1 2378node_scrape_collector_success{collector="nfsd"} 1
2379node_scrape_collector_success{collector="pressure"} 1
2364node_scrape_collector_success{collector="processes"} 1 2380node_scrape_collector_success{collector="processes"} 1
2365node_scrape_collector_success{collector="qdisc"} 1 2381node_scrape_collector_success{collector="qdisc"} 1
2366node_scrape_collector_success{collector="sockstat"} 1 2382node_scrape_collector_success{collector="sockstat"} 1
diff --git a/collector/fixtures/e2e-output.txt b/collector/fixtures/e2e-output.txt
index f4c6d56..147113e 100644
--- a/collector/fixtures/e2e-output.txt
+++ b/collector/fixtures/e2e-output.txt
@@ -2289,6 +2289,21 @@ node_nfsd_server_rpcs_total 18628
2289# HELP node_nfsd_server_threads Total number of NFSd kernel threads that are running. 2289# HELP node_nfsd_server_threads Total number of NFSd kernel threads that are running.
2290# TYPE node_nfsd_server_threads gauge 2290# TYPE node_nfsd_server_threads gauge
2291node_nfsd_server_threads 8 2291node_nfsd_server_threads 8
2292# HELP node_pressure_cpu_waiting_seconds_total Total time in seconds that processes have waited for CPU time
2293# TYPE node_pressure_cpu_waiting_seconds_total counter
2294node_pressure_cpu_waiting_seconds_total 14.036781000000001
2295# HELP node_pressure_io_stalled_seconds_total Total time in seconds no process could make progress due to IO congestion
2296# TYPE node_pressure_io_stalled_seconds_total counter
2297node_pressure_io_stalled_seconds_total 159.229614
2298# HELP node_pressure_io_waiting_seconds_total Total time in seconds that processes have waited due to IO congestion
2299# TYPE node_pressure_io_waiting_seconds_total counter
2300node_pressure_io_waiting_seconds_total 159.886802
2301# HELP node_pressure_memory_stalled_seconds_total Total time in seconds no process could make progress due to memory congestion
2302# TYPE node_pressure_memory_stalled_seconds_total counter
2303node_pressure_memory_stalled_seconds_total 0
2304# HELP node_pressure_memory_waiting_seconds_total Total time in seconds that processes have waited for memory
2305# TYPE node_pressure_memory_waiting_seconds_total counter
2306node_pressure_memory_waiting_seconds_total 0
2292# HELP node_processes_max_processes Number of max PIDs limit 2307# HELP node_processes_max_processes Number of max PIDs limit
2293# TYPE node_processes_max_processes gauge 2308# TYPE node_processes_max_processes gauge
2294node_processes_max_processes 123 2309node_processes_max_processes 123
@@ -2361,6 +2376,7 @@ node_scrape_collector_success{collector="netdev"} 1
2361node_scrape_collector_success{collector="netstat"} 1 2376node_scrape_collector_success{collector="netstat"} 1
2362node_scrape_collector_success{collector="nfs"} 1 2377node_scrape_collector_success{collector="nfs"} 1
2363node_scrape_collector_success{collector="nfsd"} 1 2378node_scrape_collector_success{collector="nfsd"} 1
2379node_scrape_collector_success{collector="pressure"} 1
2364node_scrape_collector_success{collector="processes"} 1 2380node_scrape_collector_success{collector="processes"} 1
2365node_scrape_collector_success{collector="qdisc"} 1 2381node_scrape_collector_success{collector="qdisc"} 1
2366node_scrape_collector_success{collector="sockstat"} 1 2382node_scrape_collector_success{collector="sockstat"} 1
diff --git a/collector/fixtures/proc/pressure/cpu b/collector/fixtures/proc/pressure/cpu
new file mode 100644
index 0000000..14acc3a
--- /dev/null
+++ b/collector/fixtures/proc/pressure/cpu
@@ -0,0 +1 @@
some avg10=0.00 avg60=0.00 avg300=0.00 total=14036781
diff --git a/collector/fixtures/proc/pressure/io b/collector/fixtures/proc/pressure/io
new file mode 100644
index 0000000..4cdc413
--- /dev/null
+++ b/collector/fixtures/proc/pressure/io
@@ -0,0 +1,2 @@
1some avg10=0.18 avg60=0.34 avg300=0.10 total=159886802
2full avg10=0.18 avg60=0.34 avg300=0.10 total=159229614
diff --git a/collector/fixtures/proc/pressure/memory b/collector/fixtures/proc/pressure/memory
new file mode 100644
index 0000000..30c03cc
--- /dev/null
+++ b/collector/fixtures/proc/pressure/memory
@@ -0,0 +1,2 @@
1some avg10=0.00 avg60=0.00 avg300=0.00 total=0
2full avg10=0.00 avg60=0.00 avg300=0.00 total=0
diff --git a/collector/pressure_linux.go b/collector/pressure_linux.go
new file mode 100644
index 0000000..90b20f8
--- /dev/null
+++ b/collector/pressure_linux.go
@@ -0,0 +1,105 @@
1// Copyright 2019 The Prometheus Authors
2// Licensed under the Apache License, Version 2.0 (the "License");
3// you may not use this file except in compliance with the License.
4// You may obtain a copy of the License at
5//
6// http://www.apache.org/licenses/LICENSE-2.0
7//
8// Unless required by applicable law or agreed to in writing, software
9// distributed under the License is distributed on an "AS IS" BASIS,
10// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11// See the License for the specific language governing permissions and
12// limitations under the License.
13
14// +build !nopressure
15
16package collector
17
18import (
19 "fmt"
20
21 "github.com/prometheus/client_golang/prometheus"
22 "github.com/prometheus/common/log"
23 "github.com/prometheus/procfs"
24)
25
// psiResources lists the resources whose pressure stall statistics are
// collected, in the order in which Update visits them.
var psiResources = []string{"cpu", "io", "memory"}
29
30type pressureStatsCollector struct {
31 cpu *prometheus.Desc
32 io *prometheus.Desc
33 ioFull *prometheus.Desc
34 mem *prometheus.Desc
35 memFull *prometheus.Desc
36
37 fs procfs.FS
38}
39
40func init() {
41 registerCollector("pressure", defaultEnabled, NewPressureStatsCollector)
42}
43
44// NewPressureStatsCollector returns a Collector exposing pressure stall information
45func NewPressureStatsCollector() (Collector, error) {
46 fs, err := procfs.NewFS(*procPath)
47 if err != nil {
48 return nil, fmt.Errorf("failed to open procfs: %v", err)
49 }
50
51 return &pressureStatsCollector{
52 cpu: prometheus.NewDesc(
53 prometheus.BuildFQName(namespace, "pressure", "cpu_waiting_seconds_total"),
54 "Total time in seconds that processes have waited for CPU time",
55 nil, nil,
56 ),
57 io: prometheus.NewDesc(
58 prometheus.BuildFQName(namespace, "pressure", "io_waiting_seconds_total"),
59 "Total time in seconds that processes have waited due to IO congestion",
60 nil, nil,
61 ),
62 ioFull: prometheus.NewDesc(
63 prometheus.BuildFQName(namespace, "pressure", "io_stalled_seconds_total"),
64 "Total time in seconds no process could make progress due to IO congestion",
65 nil, nil,
66 ),
67 mem: prometheus.NewDesc(
68 prometheus.BuildFQName(namespace, "pressure", "memory_waiting_seconds_total"),
69 "Total time in seconds that processes have waited for memory",
70 nil, nil,
71 ),
72 memFull: prometheus.NewDesc(
73 prometheus.BuildFQName(namespace, "pressure", "memory_stalled_seconds_total"),
74 "Total time in seconds no process could make progress due to memory congestion",
75 nil, nil,
76 ),
77 fs: fs,
78 }, nil
79}
80
81// Update calls procfs.NewPSIStatsForResource for the different resources and updates the values
82func (c *pressureStatsCollector) Update(ch chan<- prometheus.Metric) error {
83 for _, res := range psiResources {
84 log.Debugf("collecting statistics for resource: %s", res)
85 vals, err := c.fs.NewPSIStatsForResource(res)
86 if err != nil {
87 log.Debug("pressure information is unavailable, you need a Linux kernel >= 4.20 and/or CONFIG_PSI enabled for your kernel")
88 return nil
89 }
90 switch res {
91 case "cpu":
92 ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, float64(vals.Some.Total)/1000.0/1000.0)
93 case "io":
94 ch <- prometheus.MustNewConstMetric(c.io, prometheus.CounterValue, float64(vals.Some.Total)/1000.0/1000.0)
95 ch <- prometheus.MustNewConstMetric(c.ioFull, prometheus.CounterValue, float64(vals.Full.Total)/1000.0/1000.0)
96 case "memory":
97 ch <- prometheus.MustNewConstMetric(c.mem, prometheus.CounterValue, float64(vals.Some.Total)/1000.0/1000.0)
98 ch <- prometheus.MustNewConstMetric(c.memFull, prometheus.CounterValue, float64(vals.Full.Total)/1000.0/1000.0)
99 default:
100 log.Debugf("did not account for resource: %s", res)
101 }
102 }
103
104 return nil
105}
diff --git a/end-to-end-test.sh b/end-to-end-test.sh
index ea24cf5..6d3c9f4 100755
--- a/end-to-end-test.sh
+++ b/end-to-end-test.sh
@@ -28,6 +28,7 @@ enabled_collectors=$(cat << COLLECTORS
28 netstat 28 netstat
29 nfs 29 nfs
30 nfsd 30 nfsd
31 pressure
31 qdisc 32 qdisc
32 sockstat 33 sockstat
33 stat 34 stat