diff options
author | Ben Kochie <superq@gmail.com> | 2020-05-23 21:46:54 +0200 |
---|---|---|
committer | Ben Kochie <superq@gmail.com> | 2020-05-24 16:31:26 +0200 |
commit | 3565316d7e306d60ad74248291eaabb3410f2972 (patch) | |
tree | c17f3d60f0674c7794f4d9b633b1628f35c55523 | |
parent | b8847b5b3237c18aa25426fccb3d69a9bdff6ee1 (diff) | |
download | prometheus_node_collector-3565316d7e306d60ad74248291eaabb3410f2972.tar.bz2 prometheus_node_collector-3565316d7e306d60ad74248291eaabb3410f2972.tar.xz prometheus_node_collector-3565316d7e306d60ad74248291eaabb3410f2972.zip |
Linux CPU: Cache CPU metrics
Cache CPU metrics to avoid counters (e.g. iowait) jumping backwards.
Fixes: https://github.com/prometheus/node_exporter/issues/1686
Signed-off-by: Ben Kochie <superq@gmail.com>
-rw-r--r-- | .circleci/config.yml | 2 | ||||
-rw-r--r-- | CHANGELOG.md | 2 | ||||
-rw-r--r-- | collector/cpu_linux.go | 85 |
3 files changed, 86 insertions, 3 deletions
diff --git a/.circleci/config.yml b/.circleci/config.yml index 19c0f1b..018b8b1 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml | |||
@@ -28,7 +28,7 @@ jobs: | |||
28 | steps: | 28 | steps: |
29 | - checkout | 29 | - checkout |
30 | - run: sudo pip install codespell | 30 | - run: sudo pip install codespell |
31 | - run: codespell --skip=".git,./vendor,ttar,go.mod,go.sum,*pem" -L uint,packages\',uptodate | 31 | - run: codespell --skip=".git,./vendor,ttar,go.mod,go.sum,*pem,./collector/fixtures" -L uint,packages\',uptodate |
32 | 32 | ||
33 | build: | 33 | build: |
34 | machine: | 34 | machine: |
diff --git a/CHANGELOG.md b/CHANGELOG.md index a5d884f..9e05d0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md | |||
@@ -3,7 +3,7 @@ | |||
3 | * [CHANGE] | 3 | * [CHANGE] |
4 | * [FEATURE] | 4 | * [FEATURE] |
5 | * [ENHANCEMENT] | 5 | * [ENHANCEMENT] |
6 | * [BUGFIX] | 6 | * [BUGFIX] Linux CPU: Cache CPU metrics to make them monotonically increasing #1711 |
7 | 7 | ||
8 | ## 1.0.0-rc.1 / 2020-05-14 | 8 | ## 1.0.0-rc.1 / 2020-05-14 |
9 | 9 | ||
diff --git a/collector/cpu_linux.go b/collector/cpu_linux.go index ae8ee53..dfa4d4a 100644 --- a/collector/cpu_linux.go +++ b/collector/cpu_linux.go | |||
@@ -19,6 +19,7 @@ import ( | |||
19 | "fmt" | 19 | "fmt" |
20 | "path/filepath" | 20 | "path/filepath" |
21 | "strconv" | 21 | "strconv" |
22 | "sync" | ||
22 | 23 | ||
23 | "github.com/go-kit/kit/log" | 24 | "github.com/go-kit/kit/log" |
24 | "github.com/go-kit/kit/log/level" | 25 | "github.com/go-kit/kit/log/level" |
@@ -35,6 +36,8 @@ type cpuCollector struct { | |||
35 | cpuCoreThrottle *prometheus.Desc | 36 | cpuCoreThrottle *prometheus.Desc |
36 | cpuPackageThrottle *prometheus.Desc | 37 | cpuPackageThrottle *prometheus.Desc |
37 | logger log.Logger | 38 | logger log.Logger |
39 | cpuStats []procfs.CPUStat | ||
40 | cpuStatsMutex sync.Mutex | ||
38 | } | 41 | } |
39 | 42 | ||
40 | var ( | 43 | var ( |
@@ -203,7 +206,12 @@ func (c *cpuCollector) updateStat(ch chan<- prometheus.Metric) error { | |||
203 | return err | 206 | return err |
204 | } | 207 | } |
205 | 208 | ||
206 | for cpuID, cpuStat := range stats.CPU { | 209 | c.updateCPUStats(stats.CPU) |
210 | |||
211 | // Acquire a lock to read the stats. | ||
212 | c.cpuStatsMutex.Lock() | ||
213 | defer c.cpuStatsMutex.Unlock() | ||
214 | for cpuID, cpuStat := range c.cpuStats { | ||
207 | cpuNum := strconv.Itoa(cpuID) | 215 | cpuNum := strconv.Itoa(cpuID) |
208 | ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, cpuStat.User, cpuNum, "user") | 216 | ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, cpuStat.User, cpuNum, "user") |
209 | ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, cpuStat.Nice, cpuNum, "nice") | 217 | ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, cpuStat.Nice, cpuNum, "nice") |
@@ -221,3 +229,78 @@ func (c *cpuCollector) updateStat(ch chan<- prometheus.Metric) error { | |||
221 | 229 | ||
222 | return nil | 230 | return nil |
223 | } | 231 | } |
232 | |||
233 | // updateCPUStats updates the internal cache of CPU stats. | ||
234 | func (c *cpuCollector) updateCPUStats(newStats []procfs.CPUStat) { | ||
235 | // Acquire a lock to update the stats. | ||
236 | c.cpuStatsMutex.Lock() | ||
237 | defer c.cpuStatsMutex.Unlock() | ||
238 | |||
239 | // Reset the cache if the list of CPUs has changed. | ||
240 | if len(c.cpuStats) != len(newStats) { | ||
241 | c.cpuStats = make([]procfs.CPUStat, len(newStats)) | ||
242 | } | ||
243 | |||
244 | for i, n := range newStats { | ||
245 | // If idle jumps backwards, assume we had a hotplug event and reset the stats for this CPU. | ||
246 | if n.Idle < c.cpuStats[i].Idle { | ||
247 | level.Warn(c.logger).Log("msg", "CPU Idle counter jumped backwards, possible hotplug event, resetting CPU stats", "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle) | ||
248 | c.cpuStats[i] = procfs.CPUStat{} | ||
249 | } | ||
250 | c.cpuStats[i].Idle = n.Idle | ||
251 | |||
252 | if n.User >= c.cpuStats[i].User { | ||
253 | c.cpuStats[i].User = n.User | ||
254 | } else { | ||
255 | level.Warn(c.logger).Log("msg", "CPU User counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].User, "new_value", n.User) | ||
256 | } | ||
257 | |||
258 | if n.Nice >= c.cpuStats[i].Nice { | ||
259 | c.cpuStats[i].Nice = n.Nice | ||
260 | } else { | ||
261 | level.Warn(c.logger).Log("msg", "CPU Nice counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].Nice, "new_value", n.Nice) | ||
262 | } | ||
263 | |||
264 | if n.System >= c.cpuStats[i].System { | ||
265 | c.cpuStats[i].System = n.System | ||
266 | } else { | ||
267 | level.Warn(c.logger).Log("msg", "CPU System counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].System, "new_value", n.System) | ||
268 | } | ||
269 | |||
270 | if n.Iowait >= c.cpuStats[i].Iowait { | ||
271 | c.cpuStats[i].Iowait = n.Iowait | ||
272 | } else { | ||
273 | level.Warn(c.logger).Log("msg", "CPU Iowait counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].Iowait, "new_value", n.Iowait) | ||
274 | } | ||
275 | |||
276 | if n.IRQ >= c.cpuStats[i].IRQ { | ||
277 | c.cpuStats[i].IRQ = n.IRQ | ||
278 | } else { | ||
279 | level.Warn(c.logger).Log("msg", "CPU IRQ counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].IRQ, "new_value", n.IRQ) | ||
280 | } | ||
281 | |||
282 | if n.SoftIRQ >= c.cpuStats[i].SoftIRQ { | ||
283 | c.cpuStats[i].SoftIRQ = n.SoftIRQ | ||
284 | } else { | ||
285 | level.Warn(c.logger).Log("msg", "CPU SoftIRQ counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].SoftIRQ, "new_value", n.SoftIRQ) | ||
286 | } | ||
287 | |||
288 | if n.Steal >= c.cpuStats[i].Steal { | ||
289 | c.cpuStats[i].Steal = n.Steal | ||
290 | } else { | ||
291 | level.Warn(c.logger).Log("msg", "CPU Steal counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].Steal, "new_value", n.Steal) | ||
292 | } | ||
293 | |||
294 | if n.Guest >= c.cpuStats[i].Guest { | ||
295 | c.cpuStats[i].Guest = n.Guest | ||
296 | } else { | ||
297 | level.Warn(c.logger).Log("msg", "CPU Guest counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].Guest, "new_value", n.Guest) | ||
298 | } | ||
299 | |||
300 | if n.GuestNice >= c.cpuStats[i].GuestNice { | ||
301 | c.cpuStats[i].GuestNice = n.GuestNice | ||
302 | } else { | ||
303 | level.Warn(c.logger).Log("msg", "CPU GuestNice counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].GuestNice, "new_value", n.GuestNice) | ||
304 | } | ||
305 | } | ||
306 | } | ||