aboutsummaryrefslogtreecommitdiff
path: root/collector
diff options
context:
space:
mode:
authorDaniel Hodges <hodges.daniel.scott@gmail.com>2020-04-17 06:02:08 -0400
committerGitHub <noreply@github.com>2020-04-17 12:02:08 +0200
commitb14168cf6ad2fd40bbe53b29eebea149dae31105 (patch)
tree49fb89f1426227e12750fc368118a13864da9bb9 /collector
parent44357ed677f7845ab8d202bfc277f341b63e1fdc (diff)
downloadprometheus_node_collector-b14168cf6ad2fd40bbe53b29eebea149dae31105.tar.bz2
prometheus_node_collector-b14168cf6ad2fd40bbe53b29eebea149dae31105.tar.xz
prometheus_node_collector-b14168cf6ad2fd40bbe53b29eebea149dae31105.zip
Add perf tracepoint collection flag (#1664)
* Add tracepoint collector option for perf collector Signed-off-by: Daniel Hodges <hodges.daniel.scott@gmail.com>
Diffstat (limited to 'collector')
-rw-r--r--collector/perf_linux.go185
-rw-r--r--collector/perf_linux_test.go70
2 files changed, 238 insertions, 17 deletions
diff --git a/collector/perf_linux.go b/collector/perf_linux.go
index 6d19683..e452754 100644
--- a/collector/perf_linux.go
+++ b/collector/perf_linux.go
@@ -20,8 +20,10 @@ import (
20 "strings" 20 "strings"
21 21
22 "github.com/go-kit/kit/log" 22 "github.com/go-kit/kit/log"
23 "github.com/go-kit/kit/log/level"
23 "github.com/hodgesds/perf-utils" 24 "github.com/hodgesds/perf-utils"
24 "github.com/prometheus/client_golang/prometheus" 25 "github.com/prometheus/client_golang/prometheus"
26 "golang.org/x/sys/unix"
25 kingpin "gopkg.in/alecthomas/kingpin.v2" 27 kingpin "gopkg.in/alecthomas/kingpin.v2"
26) 28)
27 29
@@ -30,27 +32,29 @@ const (
30) 32)
31 33
var (
	// perfCPUsFlag is the value of the collector.perf.cpus flag: the list of
	// CPUs from which perf metrics should be collected. It defaults to the
	// empty string.
	perfCPUsFlag = kingpin.Flag("collector.perf.cpus", "List of CPUs from which perf metrics should be collected").Default("").String()

	// perfTracepointFlag is the value of the collector.perf.tracepoint flag.
	// It is declared with Strings(), so it holds a list of values; each value
	// is later parsed as "subsystem:event".
	perfTracepointFlag = kingpin.Flag("collector.perf.tracepoint", "perf tracepoint that should be collected").Strings()
)
35 38
// init registers the perf collector under perfSubsystem, passing
// defaultDisabled as its default state and NewPerfCollector as its
// constructor.
func init() {
	registerCollector(perfSubsystem, defaultDisabled, NewPerfCollector)
}
39 42
40// perfCollector is a Collector that uses the perf subsystem to collect 43// perfTracepointFlagToTracepoints returns the set of configured tracepoints.
41// metrics. It uses perf_event_open an ioctls for profiling. Due to the fact 44func perfTracepointFlagToTracepoints(tracepointsFlag []string) ([]*perfTracepoint, error) {
42// that the perf subsystem is highly dependent on kernel configuration and 45 tracepoints := make([]*perfTracepoint, len(tracepointsFlag))
43// settings not all profiler values may be exposed on the target system at any 46
44// given time. 47 for i, tracepoint := range tracepointsFlag {
45type perfCollector struct { 48 split := strings.Split(tracepoint, ":")
46 hwProfilerCPUMap map[*perf.HardwareProfiler]int 49 if len(split) != 2 {
47 swProfilerCPUMap map[*perf.SoftwareProfiler]int 50 return nil, fmt.Errorf("Invalid tracepoint config %v", tracepoint)
48 cacheProfilerCPUMap map[*perf.CacheProfiler]int 51 }
49 perfHwProfilers map[int]*perf.HardwareProfiler 52 tracepoints[i] = &perfTracepoint{
50 perfSwProfilers map[int]*perf.SoftwareProfiler 53 subsystem: split[0],
51 perfCacheProfilers map[int]*perf.CacheProfiler 54 event: split[1],
52 desc map[string]*prometheus.Desc 55 }
53 logger log.Logger 56 }
57 return tracepoints, nil
54} 58}
55 59
56// perfCPUFlagToCPUs returns a set of CPUs for the perf collectors to monitor. 60// perfCPUFlagToCPUs returns a set of CPUs for the perf collectors to monitor.
@@ -98,6 +102,144 @@ func perfCPUFlagToCPUs(cpuFlag string) ([]int, error) {
98 return cpus, nil 102 return cpus, nil
99} 103}
100 104
// perfTracepoint identifies a single tracepoint by its subsystem and event
// name, e.g. subsystem "sched" and event "sched_process_fork".
type perfTracepoint struct {
	subsystem string
	event     string
}

// label returns the tracepoint name in the format subsystem_event, for use as
// part of a metric name.
//
// Every character other than an ASCII letter, digit or underscore is replaced
// with '_'. Subsystem names may contain characters such as '-' (for example
// "xhci-hcd") that are not allowed in a Prometheus metric name; without this
// the descriptor built from the label would be invalid.
func (t *perfTracepoint) label() string {
	sanitize := func(r rune) rune {
		switch {
		case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r >= '0' && r <= '9', r == '_':
			return r
		default:
			return '_'
		}
	}
	return strings.Map(sanitize, t.subsystem+"_"+t.event)
}

// tracepoint returns the tracepoint name in the format subsystem:event. The
// name is returned unmodified, exactly as it was configured.
func (t *perfTracepoint) tracepoint() string {
	return t.subsystem + ":" + t.event
}
120
121// perfCollector is a Collector that uses the perf subsystem to collect
122// metrics. It uses perf_event_open an ioctls for profiling. Due to the fact
123// that the perf subsystem is highly dependent on kernel configuration and
124// settings not all profiler values may be exposed on the target system at any
125// given time.
126type perfCollector struct {
127 hwProfilerCPUMap map[*perf.HardwareProfiler]int
128 swProfilerCPUMap map[*perf.SoftwareProfiler]int
129 cacheProfilerCPUMap map[*perf.CacheProfiler]int
130 perfHwProfilers map[int]*perf.HardwareProfiler
131 perfSwProfilers map[int]*perf.SoftwareProfiler
132 perfCacheProfilers map[int]*perf.CacheProfiler
133 desc map[string]*prometheus.Desc
134 logger log.Logger
135 tracepointCollector *perfTracepointCollector
136}
137
138type perfTracepointCollector struct {
139 // desc is the mapping of subsystem to tracepoint *prometheus.Desc.
140 descs map[string]map[string]*prometheus.Desc
141 // collection order is the sorted configured collection order of the profiler.
142 collectionOrder []string
143
144 logger log.Logger
145 profilers map[int]perf.GroupProfiler
146}
147
148// update is used collect all tracepoints across all tracepoint profilers.
149func (c *perfTracepointCollector) update(ch chan<- prometheus.Metric) error {
150 for cpu := range c.profilers {
151 if err := c.updateCPU(cpu, ch); err != nil {
152 return err
153 }
154 }
155 return nil
156}
157
158// updateCPU is used to update metrics per CPU profiler.
159func (c *perfTracepointCollector) updateCPU(cpu int, ch chan<- prometheus.Metric) error {
160 cpuStr := fmt.Sprintf("%d", cpu)
161 profiler := c.profilers[cpu]
162 p, err := profiler.Profile()
163 if err != nil {
164 level.Error(c.logger).Log("msg", "Failed to collect tracepoint profile", "err", err)
165 return err
166 }
167
168 for i, value := range p.Values {
169 // Get the Desc from the ordered group value.
170 descKey := c.collectionOrder[i]
171 descKeySlice := strings.Split(descKey, ":")
172 ch <- prometheus.MustNewConstMetric(
173 c.descs[descKeySlice[0]][descKeySlice[1]],
174 prometheus.CounterValue,
175 float64(value),
176 cpuStr,
177 )
178 }
179 return nil
180}
181
182// newPerfTracepointCollector returns a configured perfTracepointCollector.
183func newPerfTracepointCollector(
184 logger log.Logger,
185 tracepointsFlag []string,
186 cpus []int,
187) (*perfTracepointCollector, error) {
188 tracepoints, err := perfTracepointFlagToTracepoints(tracepointsFlag)
189 if err != nil {
190 return nil, err
191 }
192
193 collectionOrder := make([]string, len(tracepoints))
194 descs := map[string]map[string]*prometheus.Desc{}
195 eventAttrs := make([]unix.PerfEventAttr, len(tracepoints))
196
197 for i, tracepoint := range tracepoints {
198 eventAttr, err := perf.TracepointEventAttr(tracepoint.subsystem, tracepoint.event)
199 if err != nil {
200 return nil, err
201 }
202 eventAttrs[i] = *eventAttr
203 collectionOrder[i] = tracepoint.tracepoint()
204 if _, ok := descs[tracepoint.subsystem]; !ok {
205 descs[tracepoint.subsystem] = map[string]*prometheus.Desc{}
206 }
207 descs[tracepoint.subsystem][tracepoint.event] = prometheus.NewDesc(
208 prometheus.BuildFQName(
209 namespace,
210 perfSubsystem,
211 tracepoint.label(),
212 ),
213 "Perf tracepoint "+tracepoint.tracepoint(),
214 []string{"cpu"},
215 nil,
216 )
217 }
218
219 profilers := make(map[int]perf.GroupProfiler, len(cpus))
220 for _, cpu := range cpus {
221 profiler, err := perf.NewGroupProfiler(-1, cpu, 0, eventAttrs...)
222 if err != nil {
223 return nil, err
224 }
225 profilers[cpu] = profiler
226 }
227
228 c := &perfTracepointCollector{
229 descs: descs,
230 collectionOrder: collectionOrder,
231 profilers: profilers,
232 logger: logger,
233 }
234
235 for _, profiler := range c.profilers {
236 if err := profiler.Start(); err != nil {
237 return nil, err
238 }
239 }
240 return c, nil
241}
242
101// NewPerfCollector returns a new perf based collector, it creates a profiler 243// NewPerfCollector returns a new perf based collector, it creates a profiler
102// per CPU. 244// per CPU.
103func NewPerfCollector(logger log.Logger) (Collector, error) { 245func NewPerfCollector(logger log.Logger) (Collector, error) {
@@ -127,6 +269,16 @@ func NewPerfCollector(logger log.Logger) (Collector, error) {
127 } 269 }
128 } 270 }
129 271
272 // First configure any tracepoints.
273 if *perfTracepointFlag != nil && len(*perfTracepointFlag) > 0 {
274 tracepointCollector, err := newPerfTracepointCollector(logger, *perfTracepointFlag, cpus)
275 if err != nil {
276 return nil, err
277 }
278 collector.tracepointCollector = tracepointCollector
279 }
280
281 // Configure all profilers for the specified CPUs.
130 for _, cpu := range cpus { 282 for _, cpu := range cpus {
131 // Use -1 to profile all processes on the CPU, see: 283 // Use -1 to profile all processes on the CPU, see:
132 // man perf_event_open 284 // man perf_event_open
@@ -411,6 +563,9 @@ func (c *perfCollector) Update(ch chan<- prometheus.Metric) error {
411 if err := c.updateCacheStats(ch); err != nil { 563 if err := c.updateCacheStats(ch); err != nil {
412 return err 564 return err
413 } 565 }
566 if c.tracepointCollector != nil {
567 return c.tracepointCollector.update(ch)
568 }
414 569
415 return nil 570 return nil
416} 571}
diff --git a/collector/perf_linux_test.go b/collector/perf_linux_test.go
index fca5455..b384a65 100644
--- a/collector/perf_linux_test.go
+++ b/collector/perf_linux_test.go
@@ -180,8 +180,74 @@ func TestPerfCPUFlagToCPUs(t *testing.T) {
180 if test.exCpus[i] != cpus[i] { 180 if test.exCpus[i] != cpus[i] {
181 t.Fatalf( 181 t.Fatalf(
182 "expected cpus %v, got %v", 182 "expected cpus %v, got %v",
183 test.exCpus, 183 test.exCpus[i],
184 cpus, 184 cpus[i],
185 )
186 }
187 }
188 })
189 }
190}
191
192func TestPerfTracepointFlagToTracepoints(t *testing.T) {
193 tests := []struct {
194 name string
195 flag []string
196 exTracepoints []*perfTracepoint
197 errStr string
198 }{
199 {
200 name: "valid single tracepoint",
201 flag: []string{"sched:sched_kthread_stop"},
202 exTracepoints: []*perfTracepoint{
203 {
204 subsystem: "sched",
205 event: "sched_kthread_stop",
206 },
207 },
208 },
209 {
210 name: "valid multiple tracepoints",
211 flag: []string{"sched:sched_kthread_stop", "sched:sched_process_fork"},
212 exTracepoints: []*perfTracepoint{
213 {
214 subsystem: "sched",
215 event: "sched_kthread_stop",
216 },
217 {
218 subsystem: "sched",
219 event: "sched_process_fork",
220 },
221 },
222 },
223 }
224
225 for _, test := range tests {
226 t.Run(test.name, func(t *testing.T) {
227 tracepoints, err := perfTracepointFlagToTracepoints(test.flag)
228 if test.errStr != "" {
229 if err != nil {
230 t.Fatal("expected error to not be nil")
231 }
232 if test.errStr != err.Error() {
233 t.Fatalf(
234 "expected error %q, got %q",
235 test.errStr,
236 err.Error(),
237 )
238 }
239 return
240 }
241 if err != nil {
242 t.Fatal(err)
243 }
244 for i := range tracepoints {
245 if test.exTracepoints[i].event != tracepoints[i].event &&
246 test.exTracepoints[i].subsystem != tracepoints[i].subsystem {
247 t.Fatalf(
248 "expected tracepoint %v, got %v",
249 test.exTracepoints[i],
250 tracepoints[i],
185 ) 251 )
186 } 252 }
187 } 253 }