diff options
author | Ben Kochie <superq@gmail.com> | 2018-08-13 17:27:23 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-08-13 17:27:23 +0200 |
commit | fe5a1178313b2f56e53d4194ae306373d92a208a (patch) | |
tree | b5523e468e29cc439a09cda142807fadb8187a8a | |
parent | 099c1527f191e9715a42d338f0d89e34690a6d5b (diff) | |
download | prometheus_node_collector-fe5a1178313b2f56e53d4194ae306373d92a208a.tar.bz2 prometheus_node_collector-fe5a1178313b2f56e53d4194ae306373d92a208a.tar.xz prometheus_node_collector-fe5a1178313b2f56e53d4194ae306373d92a208a.zip |
Handle vanishing PIDs (#1043)
PIDs can vanish (exit) from /proc/ between gathering the list of PIDs
and getting all of their stats.
* Ignore file-not-found errors.
* Explicitly count the PIDs we find.
* Clean up some error style issues.
Signed-off-by: Ben Kochie <superq@gmail.com>
-rw-r--r-- | CHANGELOG.md | 2 | ||||
-rw-r--r-- | collector/fixtures/proc/11/.missing_stat | 0 | ||||
-rw-r--r-- | collector/processes_linux.go | 18 |
3 files changed, 15 insertions, 5 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a447d6..7758cc9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md | |||
@@ -14,9 +14,9 @@ The wifi collector is disabled by default due to suspected caching issues and go | |||
14 | * [FEATURE] Add socket unit stats to systemd collector #968 | 14 | * [FEATURE] Add socket unit stats to systemd collector #968 |
15 | * [FEATURE] Collect start time for systemd units | 15 | * [FEATURE] Collect start time for systemd units |
16 | * [ENHANCEMENT] | 16 | * [ENHANCEMENT] |
17 | * [BUGFIX] | ||
18 | 17 | ||
19 | * [BUGFIX] Fix goroutine leak in supervisord collector | 18 | * [BUGFIX] Fix goroutine leak in supervisord collector |
19 | * [BUGFIX] Handle vanishing PIDs #1043 | ||
20 | 20 | ||
21 | ## 0.16.0 / 2018-05-15 | 21 | ## 0.16.0 / 2018-05-15 |
22 | 22 | ||
diff --git a/collector/fixtures/proc/11/.missing_stat b/collector/fixtures/proc/11/.missing_stat new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/collector/fixtures/proc/11/.missing_stat | |||
diff --git a/collector/processes_linux.go b/collector/processes_linux.go index fd6ddb1..344844b 100644 --- a/collector/processes_linux.go +++ b/collector/processes_linux.go | |||
@@ -17,7 +17,10 @@ package collector | |||
17 | 17 | ||
18 | import ( | 18 | import ( |
19 | "fmt" | 19 | "fmt" |
20 | "os" | ||
21 | |||
20 | "github.com/prometheus/client_golang/prometheus" | 22 | "github.com/prometheus/client_golang/prometheus" |
23 | "github.com/prometheus/common/log" | ||
21 | "github.com/prometheus/procfs" | 24 | "github.com/prometheus/procfs" |
22 | ) | 25 | ) |
23 | 26 | ||
@@ -62,13 +65,13 @@ func NewProcessStatCollector() (Collector, error) { | |||
62 | func (t *processCollector) Update(ch chan<- prometheus.Metric) error { | 65 | func (t *processCollector) Update(ch chan<- prometheus.Metric) error { |
63 | pids, states, threads, err := getAllocatedThreads() | 66 | pids, states, threads, err := getAllocatedThreads() |
64 | if err != nil { | 67 | if err != nil { |
65 | return fmt.Errorf("Unable to retrieve number of allocated threads %v\n", err) | 68 | return fmt.Errorf("unable to retrieve number of allocated threads: %q", err) |
66 | } | 69 | } |
67 | 70 | ||
68 | ch <- prometheus.MustNewConstMetric(t.threadAlloc, prometheus.GaugeValue, float64(threads)) | 71 | ch <- prometheus.MustNewConstMetric(t.threadAlloc, prometheus.GaugeValue, float64(threads)) |
69 | maxThreads, err := readUintFromFile(procFilePath("sys/kernel/threads-max")) | 72 | maxThreads, err := readUintFromFile(procFilePath("sys/kernel/threads-max")) |
70 | if err != nil { | 73 | if err != nil { |
71 | return fmt.Errorf("Unable to retrieve limit number of threads %v\n", err) | 74 | return fmt.Errorf("unable to retrieve limit number of threads: %q", err) |
72 | } | 75 | } |
73 | ch <- prometheus.MustNewConstMetric(t.threadLimit, prometheus.GaugeValue, float64(maxThreads)) | 76 | ch <- prometheus.MustNewConstMetric(t.threadLimit, prometheus.GaugeValue, float64(maxThreads)) |
74 | 77 | ||
@@ -78,7 +81,7 @@ func (t *processCollector) Update(ch chan<- prometheus.Metric) error { | |||
78 | 81 | ||
79 | pidM, err := readUintFromFile(procFilePath("sys/kernel/pid_max")) | 82 | pidM, err := readUintFromFile(procFilePath("sys/kernel/pid_max")) |
80 | if err != nil { | 83 | if err != nil { |
81 | return fmt.Errorf("Unable to retrieve limit number of maximum pids alloved %v\n", err) | 84 | return fmt.Errorf("unable to retrieve limit number of maximum pids alloved: %q", err) |
82 | } | 85 | } |
83 | ch <- prometheus.MustNewConstMetric(t.pidUsed, prometheus.GaugeValue, float64(pids)) | 86 | ch <- prometheus.MustNewConstMetric(t.pidUsed, prometheus.GaugeValue, float64(pids)) |
84 | ch <- prometheus.MustNewConstMetric(t.pidMax, prometheus.GaugeValue, float64(pidM)) | 87 | ch <- prometheus.MustNewConstMetric(t.pidMax, prometheus.GaugeValue, float64(pidM)) |
@@ -95,15 +98,22 @@ func getAllocatedThreads() (int, map[string]int32, int, error) { | |||
95 | if err != nil { | 98 | if err != nil { |
96 | return 0, nil, 0, err | 99 | return 0, nil, 0, err |
97 | } | 100 | } |
101 | pids := 0 | ||
98 | thread := 0 | 102 | thread := 0 |
99 | procStates := make(map[string]int32) | 103 | procStates := make(map[string]int32) |
100 | for _, pid := range p { | 104 | for _, pid := range p { |
101 | stat, err := pid.NewStat() | 105 | stat, err := pid.NewStat() |
106 | // PIDs can vanish between getting the list and getting stats. | ||
107 | if os.IsNotExist(err) { | ||
108 | log.Debugf("file not found when retrieving stats: %q", err) | ||
109 | continue | ||
110 | } | ||
102 | if err != nil { | 111 | if err != nil { |
103 | return 0, nil, 0, err | 112 | return 0, nil, 0, err |
104 | } | 113 | } |
114 | pids += 1 | ||
105 | procStates[stat.State] += 1 | 115 | procStates[stat.State] += 1 |
106 | thread += stat.NumThreads | 116 | thread += stat.NumThreads |
107 | } | 117 | } |
108 | return len(p), procStates, thread, nil | 118 | return pids, procStates, thread, nil |
109 | } | 119 | } |