aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Ben Kochie <superq@gmail.com> 2018-08-13 17:27:23 +0200
committer GitHub <noreply@github.com> 2018-08-13 17:27:23 +0200
commit fe5a1178313b2f56e53d4194ae306373d92a208a (patch)
tree b5523e468e29cc439a09cda142807fadb8187a8a
parent 099c1527f191e9715a42d338f0d89e34690a6d5b (diff)
download prometheus_node_collector-fe5a1178313b2f56e53d4194ae306373d92a208a.tar.bz2
prometheus_node_collector-fe5a1178313b2f56e53d4194ae306373d92a208a.tar.xz
prometheus_node_collector-fe5a1178313b2f56e53d4194ae306373d92a208a.zip
Handle vanishing PIDs (#1043)
PIDs can vanish (exit) from /proc/ between gathering the list of PIDs and getting all of their stats.

* Ignore file not found errors.
* Explicitly count the PIDs we find.
* Clean up some error style issues.

Signed-off-by: Ben Kochie <superq@gmail.com>
-rw-r--r-- CHANGELOG.md 2
-rw-r--r-- collector/fixtures/proc/11/.missing_stat 0
-rw-r--r-- collector/processes_linux.go 18
3 files changed, 15 insertions, 5 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4a447d6..7758cc9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,9 +14,9 @@ The wifi collector is disabled by default due to suspected caching issues and go
14* [FEATURE] Add socket unit stats to systemd collector #968 14* [FEATURE] Add socket unit stats to systemd collector #968
15* [FEATURE] Collect start time for systemd units 15* [FEATURE] Collect start time for systemd units
16* [ENHANCEMENT] 16* [ENHANCEMENT]
17* [BUGFIX]
18 17
19* [BUGFIX] Fix goroutine leak in supervisord collector 18* [BUGFIX] Fix goroutine leak in supervisord collector
19* [BUGFIX] Handle vanishing PIDs #1043
20 20
21## 0.16.0 / 2018-05-15 21## 0.16.0 / 2018-05-15
22 22
diff --git a/collector/fixtures/proc/11/.missing_stat b/collector/fixtures/proc/11/.missing_stat
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/collector/fixtures/proc/11/.missing_stat
diff --git a/collector/processes_linux.go b/collector/processes_linux.go
index fd6ddb1..344844b 100644
--- a/collector/processes_linux.go
+++ b/collector/processes_linux.go
@@ -17,7 +17,10 @@ package collector
17 17
18import ( 18import (
19 "fmt" 19 "fmt"
20 "os"
21
20 "github.com/prometheus/client_golang/prometheus" 22 "github.com/prometheus/client_golang/prometheus"
23 "github.com/prometheus/common/log"
21 "github.com/prometheus/procfs" 24 "github.com/prometheus/procfs"
22) 25)
23 26
@@ -62,13 +65,13 @@ func NewProcessStatCollector() (Collector, error) {
62func (t *processCollector) Update(ch chan<- prometheus.Metric) error { 65func (t *processCollector) Update(ch chan<- prometheus.Metric) error {
63 pids, states, threads, err := getAllocatedThreads() 66 pids, states, threads, err := getAllocatedThreads()
64 if err != nil { 67 if err != nil {
65 return fmt.Errorf("Unable to retrieve number of allocated threads %v\n", err) 68 return fmt.Errorf("unable to retrieve number of allocated threads: %q", err)
66 } 69 }
67 70
68 ch <- prometheus.MustNewConstMetric(t.threadAlloc, prometheus.GaugeValue, float64(threads)) 71 ch <- prometheus.MustNewConstMetric(t.threadAlloc, prometheus.GaugeValue, float64(threads))
69 maxThreads, err := readUintFromFile(procFilePath("sys/kernel/threads-max")) 72 maxThreads, err := readUintFromFile(procFilePath("sys/kernel/threads-max"))
70 if err != nil { 73 if err != nil {
71 return fmt.Errorf("Unable to retrieve limit number of threads %v\n", err) 74 return fmt.Errorf("unable to retrieve limit number of threads: %q", err)
72 } 75 }
73 ch <- prometheus.MustNewConstMetric(t.threadLimit, prometheus.GaugeValue, float64(maxThreads)) 76 ch <- prometheus.MustNewConstMetric(t.threadLimit, prometheus.GaugeValue, float64(maxThreads))
74 77
@@ -78,7 +81,7 @@ func (t *processCollector) Update(ch chan<- prometheus.Metric) error {
78 81
79 pidM, err := readUintFromFile(procFilePath("sys/kernel/pid_max")) 82 pidM, err := readUintFromFile(procFilePath("sys/kernel/pid_max"))
80 if err != nil { 83 if err != nil {
81 return fmt.Errorf("Unable to retrieve limit number of maximum pids alloved %v\n", err) 84 return fmt.Errorf("unable to retrieve limit number of maximum pids alloved: %q", err)
82 } 85 }
83 ch <- prometheus.MustNewConstMetric(t.pidUsed, prometheus.GaugeValue, float64(pids)) 86 ch <- prometheus.MustNewConstMetric(t.pidUsed, prometheus.GaugeValue, float64(pids))
84 ch <- prometheus.MustNewConstMetric(t.pidMax, prometheus.GaugeValue, float64(pidM)) 87 ch <- prometheus.MustNewConstMetric(t.pidMax, prometheus.GaugeValue, float64(pidM))
@@ -95,15 +98,22 @@ func getAllocatedThreads() (int, map[string]int32, int, error) {
95 if err != nil { 98 if err != nil {
96 return 0, nil, 0, err 99 return 0, nil, 0, err
97 } 100 }
101 pids := 0
98 thread := 0 102 thread := 0
99 procStates := make(map[string]int32) 103 procStates := make(map[string]int32)
100 for _, pid := range p { 104 for _, pid := range p {
101 stat, err := pid.NewStat() 105 stat, err := pid.NewStat()
106 // PIDs can vanish between getting the list and getting stats.
107 if os.IsNotExist(err) {
108 log.Debugf("file not found when retrieving stats: %q", err)
109 continue
110 }
102 if err != nil { 111 if err != nil {
103 return 0, nil, 0, err 112 return 0, nil, 0, err
104 } 113 }
114 pids += 1
105 procStates[stat.State] += 1 115 procStates[stat.State] += 1
106 thread += stat.NumThreads 116 thread += stat.NumThreads
107 } 117 }
108 return len(p), procStates, thread, nil 118 return pids, procStates, thread, nil
109} 119}