diff options
-rw-r--r-- | CHANGELOG.md | 1 | ||||
-rw-r--r-- | README.md | 12 | ||||
-rw-r--r-- | collector/perf_linux.go | 165 | ||||
-rw-r--r-- | collector/perf_linux_test.go | 73 |
4 files changed, 215 insertions, 36 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index f017b57..68022ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md | |||
@@ -14,6 +14,7 @@ | |||
14 | 14 | ||
15 | ### Changes | 15 | ### Changes |
16 | 16 | ||
17 | * [ENHANCEMENT] Add `--collector.perf.cpus` to allow setting the CPU list for perf stats. | ||
17 | * [CHANGE] Add `--collector.netdev.device-whitelist`. #1279 | 18 | * [CHANGE] Add `--collector.netdev.device-whitelist`. #1279 |
18 | * [CHANGE] Refactor mdadm collector #1403 | 19 | * [CHANGE] Refactor mdadm collector #1403 |
19 | * [CHANGE] Add `mountaddr` label to NFS metrics. #1417 | 20 | * [CHANGE] Add `mountaddr` label to NFS metrics. #1417 |
@@ -84,6 +84,18 @@ Depending on the configured value different metrics will be available, for most | |||
84 | cases `0` will provide the most complete set. For more information see [`man 2 | 84 | cases `0` will provide the most complete set. For more information see [`man 2 |
85 | perf_event_open`](http://man7.org/linux/man-pages/man2/perf_event_open.2.html). | 85 | perf_event_open`](http://man7.org/linux/man-pages/man2/perf_event_open.2.html). |
86 | 86 | ||
87 | By default, the perf collector will only collect metrics of the CPUs that | ||
88 | `node_exporter` is running on (i.e. | ||
89 | [`runtime.NumCPU`](https://golang.org/pkg/runtime/#NumCPU)). If this is | ||
90 | insufficient (e.g. if you run `node_exporter` with its CPU affinity set to | ||
91 | specific CPUs), you can specify a list of alternate CPUs by using the | ||
92 | `--collector.perf.cpus` flag. For example, to collect metrics on CPUs 2-6, you | ||
93 | would specify: `--collector.perf --collector.perf.cpus=2-6`. The CPU | ||
94 | configuration is zero-indexed and can also take a stride value; for | ||
95 | example, `--collector.perf --collector.perf.cpus=1-10:5` would collect | ||
96 | on CPUs 1 and 6 (starting at 1, then every 5th CPU up to 10). | ||
97 | |||
98 | |||
87 | Name | Description | OS | 99 | Name | Description | OS |
88 | ---------|-------------|---- | 100 | ---------|-------------|---- |
89 | buddyinfo | Exposes statistics of memory fragments as reported by /proc/buddyinfo. | Linux | 101 | buddyinfo | Exposes statistics of memory fragments as reported by /proc/buddyinfo. | Linux |
diff --git a/collector/perf_linux.go b/collector/perf_linux.go index e8a52b4..b67f970 100644 --- a/collector/perf_linux.go +++ b/collector/perf_linux.go | |||
@@ -14,18 +14,25 @@ | |||
14 | package collector | 14 | package collector |
15 | 15 | ||
16 | import ( | 16 | import ( |
17 | "fmt" | ||
17 | "runtime" | 18 | "runtime" |
18 | "strconv" | 19 | "strconv" |
20 | "strings" | ||
19 | 21 | ||
20 | "github.com/go-kit/kit/log" | 22 | "github.com/go-kit/kit/log" |
21 | "github.com/hodgesds/perf-utils" | 23 | "github.com/hodgesds/perf-utils" |
22 | "github.com/prometheus/client_golang/prometheus" | 24 | "github.com/prometheus/client_golang/prometheus" |
25 | kingpin "gopkg.in/alecthomas/kingpin.v2" | ||
23 | ) | 26 | ) |
24 | 27 | ||
25 | const ( | 28 | const ( |
26 | perfSubsystem = "perf" | 29 | perfSubsystem = "perf" |
27 | ) | 30 | ) |
28 | 31 | ||
32 | var ( | ||
33 | perfCPUsFlag = kingpin.Flag("collector.perf.cpus", "List of CPUs from which perf metrics should be collected").Default("").String() | ||
34 | ) | ||
35 | |||
29 | func init() { | 36 | func init() { |
30 | registerCollector(perfSubsystem, defaultDisabled, NewPerfCollector) | 37 | registerCollector(perfSubsystem, defaultDisabled, NewPerfCollector) |
31 | } | 38 | } |
@@ -36,40 +43,123 @@ func init() { | |||
36 | // settings not all profiler values may be exposed on the target system at any | 43 | // settings not all profiler values may be exposed on the target system at any |
37 | // given time. | 44 | // given time. |
38 | type perfCollector struct { | 45 | type perfCollector struct { |
39 | perfHwProfilers map[int]perf.HardwareProfiler | 46 | hwProfilerCPUMap map[*perf.HardwareProfiler]int |
40 | perfSwProfilers map[int]perf.SoftwareProfiler | 47 | swProfilerCPUMap map[*perf.SoftwareProfiler]int |
41 | perfCacheProfilers map[int]perf.CacheProfiler | 48 | cacheProfilerCPUMap map[*perf.CacheProfiler]int |
42 | desc map[string]*prometheus.Desc | 49 | perfHwProfilers map[int]*perf.HardwareProfiler |
43 | logger log.Logger | 50 | perfSwProfilers map[int]*perf.SoftwareProfiler |
51 | perfCacheProfilers map[int]*perf.CacheProfiler | ||
52 | desc map[string]*prometheus.Desc | ||
53 | logger log.Logger | ||
54 | } | ||
55 | |||
56 | // perfCPUFlagToCPUs returns a set of CPUs for the perf collectors to monitor. | ||
57 | func perfCPUFlagToCPUs(cpuFlag string) ([]int, error) { | ||
58 | var err error | ||
59 | cpus := []int{} | ||
60 | for _, subset := range strings.Split(cpuFlag, ",") { | ||
61 | // First parse a single CPU. | ||
62 | if !strings.Contains(subset, "-") { | ||
63 | cpu, err := strconv.Atoi(subset) | ||
64 | if err != nil { | ||
65 | return nil, err | ||
66 | } | ||
67 | cpus = append(cpus, cpu) | ||
68 | continue | ||
69 | } | ||
70 | |||
71 | stride := 1 | ||
72 | // Handle strides, e.g. 1-10:5 yields 1 and 6 (start, then every 5th CPU up to end) | ||
73 | strideSet := strings.Split(subset, ":") | ||
74 | if len(strideSet) == 2 { | ||
75 | stride, err = strconv.Atoi(strideSet[1]) | ||
76 | if err != nil { | ||
77 | return nil, err | ||
78 | } | ||
79 | } | ||
80 | |||
81 | rangeSet := strings.Split(strideSet[0], "-") | ||
82 | if len(rangeSet) != 2 { | ||
83 | return nil, fmt.Errorf("invalid flag value %q", cpuFlag) | ||
84 | } | ||
85 | start, err := strconv.Atoi(rangeSet[0]) | ||
86 | if err != nil { | ||
87 | return nil, err | ||
88 | } | ||
89 | end, err := strconv.Atoi(rangeSet[1]) | ||
90 | if err != nil { | ||
91 | return nil, err | ||
92 | } | ||
93 | for i := start; i <= end; i += stride { | ||
94 | cpus = append(cpus, i) | ||
95 | } | ||
96 | } | ||
97 | |||
98 | return cpus, nil | ||
44 | } | 99 | } |
45 | 100 | ||
46 | // NewPerfCollector returns a new perf based collector, it creates a profiler | 101 | // NewPerfCollector returns a new perf based collector, it creates a profiler |
47 | // per CPU. | 102 | // per CPU. |
48 | func NewPerfCollector(logger log.Logger) (Collector, error) { | 103 | func NewPerfCollector(logger log.Logger) (Collector, error) { |
49 | c := &perfCollector{ | 104 | collector := &perfCollector{ |
50 | perfHwProfilers: map[int]perf.HardwareProfiler{}, | 105 | perfHwProfilers: map[int]*perf.HardwareProfiler{}, |
51 | perfSwProfilers: map[int]perf.SoftwareProfiler{}, | 106 | perfSwProfilers: map[int]*perf.SoftwareProfiler{}, |
52 | perfCacheProfilers: map[int]perf.CacheProfiler{}, | 107 | perfCacheProfilers: map[int]*perf.CacheProfiler{}, |
53 | logger: logger, | 108 | hwProfilerCPUMap: map[*perf.HardwareProfiler]int{}, |
109 | swProfilerCPUMap: map[*perf.SoftwareProfiler]int{}, | ||
110 | cacheProfilerCPUMap: map[*perf.CacheProfiler]int{}, | ||
111 | logger: logger, | ||
54 | } | 112 | } |
55 | ncpus := runtime.NumCPU() | 113 | |
56 | for i := 0; i < ncpus; i++ { | 114 | if perfCPUsFlag != nil && *perfCPUsFlag != "" { |
57 | // Use -1 to profile all processes on the CPU, see: | 115 | cpus, err := perfCPUFlagToCPUs(*perfCPUsFlag) |
58 | // man perf_event_open | 116 | if err != nil { |
59 | c.perfHwProfilers[i] = perf.NewHardwareProfiler(-1, i) | 117 | return nil, err |
60 | if err := c.perfHwProfilers[i].Start(); err != nil { | 118 | } |
61 | return c, err | 119 | for _, cpu := range cpus { |
62 | } | 120 | // Use -1 to profile all processes on the CPU, see: |
63 | c.perfSwProfilers[i] = perf.NewSoftwareProfiler(-1, i) | 121 | // man perf_event_open |
64 | if err := c.perfSwProfilers[i].Start(); err != nil { | 122 | hwProf := perf.NewHardwareProfiler(-1, cpu) |
65 | return c, err | 123 | if err := hwProf.Start(); err != nil { |
66 | } | 124 | return nil, err |
67 | c.perfCacheProfilers[i] = perf.NewCacheProfiler(-1, i) | 125 | } |
68 | if err := c.perfCacheProfilers[i].Start(); err != nil { | 126 | collector.perfHwProfilers[cpu] = &hwProf |
69 | return c, err | 127 | |
128 | swProf := perf.NewSoftwareProfiler(-1, cpu) | ||
129 | if err := swProf.Start(); err != nil { | ||
130 | return nil, err | ||
131 | } | ||
132 | collector.perfSwProfilers[cpu] = &swProf | ||
133 | |||
134 | cacheProf := perf.NewCacheProfiler(-1, cpu) | ||
135 | if err := cacheProf.Start(); err != nil { | ||
136 | return nil, err | ||
137 | } | ||
138 | collector.perfCacheProfilers[cpu] = &cacheProf | ||
139 | } | ||
140 | } else { | ||
141 | for i := 0; i < runtime.NumCPU(); i++ { | ||
142 | hwProf := perf.NewHardwareProfiler(-1, i) | ||
143 | if err := hwProf.Start(); err != nil { | ||
144 | return nil, err | ||
145 | } | ||
146 | collector.perfHwProfilers[i] = &hwProf | ||
147 | |||
148 | swProf := perf.NewSoftwareProfiler(-1, i) | ||
149 | if err := swProf.Start(); err != nil { | ||
150 | return nil, err | ||
151 | } | ||
152 | collector.perfSwProfilers[i] = &swProf | ||
153 | |||
154 | cacheProf := perf.NewCacheProfiler(-1, i) | ||
155 | if err := cacheProf.Start(); err != nil { | ||
156 | return nil, err | ||
157 | } | ||
158 | collector.perfCacheProfilers[i] = &cacheProf | ||
70 | } | 159 | } |
71 | } | 160 | } |
72 | c.desc = map[string]*prometheus.Desc{ | 161 | |
162 | collector.desc = map[string]*prometheus.Desc{ | ||
73 | "cpucycles_total": prometheus.NewDesc( | 163 | "cpucycles_total": prometheus.NewDesc( |
74 | prometheus.BuildFQName( | 164 | prometheus.BuildFQName( |
75 | namespace, | 165 | namespace, |
@@ -312,7 +402,7 @@ func NewPerfCollector(logger log.Logger) (Collector, error) { | |||
312 | ), | 402 | ), |
313 | } | 403 | } |
314 | 404 | ||
315 | return c, nil | 405 | return collector, nil |
316 | } | 406 | } |
317 | 407 | ||
318 | // Update implements the Collector interface and will collect metrics per CPU. | 408 | // Update implements the Collector interface and will collect metrics per CPU. |
@@ -333,9 +423,10 @@ func (c *perfCollector) Update(ch chan<- prometheus.Metric) error { | |||
333 | } | 423 | } |
334 | 424 | ||
335 | func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error { | 425 | func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error { |
336 | for cpu, profiler := range c.perfHwProfilers { | 426 | for _, profiler := range c.perfHwProfilers { |
337 | cpuStr := strconv.Itoa(cpu) | 427 | cpuid := c.hwProfilerCPUMap[profiler] |
338 | hwProfile, err := profiler.Profile() | 428 | cpuStr := fmt.Sprintf("%d", cpuid) |
429 | hwProfile, err := (*profiler).Profile() | ||
339 | if err != nil { | 430 | if err != nil { |
340 | return err | 431 | return err |
341 | } | 432 | } |
@@ -404,9 +495,10 @@ func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error { | |||
404 | } | 495 | } |
405 | 496 | ||
406 | func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error { | 497 | func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error { |
407 | for cpu, profiler := range c.perfSwProfilers { | 498 | for _, profiler := range c.perfSwProfilers { |
408 | cpuStr := strconv.Itoa(cpu) | 499 | cpuid := c.swProfilerCPUMap[profiler] |
409 | swProfile, err := profiler.Profile() | 500 | cpuStr := fmt.Sprintf("%d", cpuid) |
501 | swProfile, err := (*profiler).Profile() | ||
410 | if err != nil { | 502 | if err != nil { |
411 | return err | 503 | return err |
412 | } | 504 | } |
@@ -459,9 +551,10 @@ func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error { | |||
459 | } | 551 | } |
460 | 552 | ||
461 | func (c *perfCollector) updateCacheStats(ch chan<- prometheus.Metric) error { | 553 | func (c *perfCollector) updateCacheStats(ch chan<- prometheus.Metric) error { |
462 | for cpu, profiler := range c.perfCacheProfilers { | 554 | for _, profiler := range c.perfCacheProfilers { |
463 | cpuStr := strconv.Itoa(cpu) | 555 | cpuid := c.cacheProfilerCPUMap[profiler] |
464 | cacheProfile, err := profiler.Profile() | 556 | cpuStr := fmt.Sprintf("%d", cpuid) |
557 | cacheProfile, err := (*profiler).Profile() | ||
465 | if err != nil { | 558 | if err != nil { |
466 | return err | 559 | return err |
467 | } | 560 | } |
diff --git a/collector/perf_linux_test.go b/collector/perf_linux_test.go index 68c580b..eecfab9 100644 --- a/collector/perf_linux_test.go +++ b/collector/perf_linux_test.go | |||
@@ -54,3 +54,76 @@ func TestPerfCollector(t *testing.T) { | |||
54 | t.Fatal(err) | 54 | t.Fatal(err) |
55 | } | 55 | } |
56 | } | 56 | } |
57 | |||
58 | func TestPerfCPUFlagToCPUs(t *testing.T) { | ||
59 | tests := []struct { | ||
60 | name string | ||
61 | flag string | ||
62 | exCpus []int | ||
63 | errStr string | ||
64 | }{ | ||
65 | { | ||
66 | name: "valid single cpu", | ||
67 | flag: "1", | ||
68 | exCpus: []int{1}, | ||
69 | }, | ||
70 | { | ||
71 | name: "valid range cpus", | ||
72 | flag: "1-5", | ||
73 | exCpus: []int{1, 2, 3, 4, 5}, | ||
74 | }, | ||
75 | { | ||
76 | name: "valid double digit", | ||
77 | flag: "10", | ||
78 | exCpus: []int{10}, | ||
79 | }, | ||
80 | { | ||
81 | name: "valid double digit range", | ||
82 | flag: "10-12", | ||
83 | exCpus: []int{10, 11, 12}, | ||
84 | }, | ||
85 | { | ||
86 | name: "valid double digit stride", | ||
87 | flag: "10-20:5", | ||
88 | exCpus: []int{10, 15, 20}, | ||
89 | }, | ||
90 | } | ||
91 | |||
92 | for _, test := range tests { | ||
93 | t.Run(test.name, func(t *testing.T) { | ||
94 | cpus, err := perfCPUFlagToCPUs(test.flag) | ||
95 | if test.errStr != "" { | ||
96 | if err != nil { | ||
97 | t.Fatal("expected error to not be nil") | ||
98 | } | ||
99 | if test.errStr != err.Error() { | ||
100 | t.Fatalf( | ||
101 | "expected error %q, got %q", | ||
102 | test.errStr, | ||
103 | err.Error(), | ||
104 | ) | ||
105 | } | ||
106 | return | ||
107 | } | ||
108 | if err != nil { | ||
109 | t.Fatal(err) | ||
110 | } | ||
111 | if len(cpus) != len(test.exCpus) { | ||
112 | t.Fatalf( | ||
113 | "expected cpus %v, got %v", | ||
114 | test.exCpus, | ||
115 | cpus, | ||
116 | ) | ||
117 | } | ||
118 | for i := range cpus { | ||
119 | if test.exCpus[i] != cpus[i] { | ||
120 | t.Fatalf( | ||
121 | "expected cpus %v, got %v", | ||
122 | test.exCpus, | ||
123 | cpus, | ||
124 | ) | ||
125 | } | ||
126 | } | ||
127 | }) | ||
128 | } | ||
129 | } | ||