diff options
author | Daniel Hodges <hodges@uber.com> | 2020-02-20 05:36:33 -0500 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-02-20 11:36:33 +0100 |
commit | ec6214138801d80045e916479a1f8297c40de8e3 (patch) | |
tree | c71787ad5060284be50ebce5e6445d587304ddeb /collector | |
parent | b40954dce598577413d93ce32cc005b57c6371bb (diff) | |
download | prometheus_node_collector-ec6214138801d80045e916479a1f8297c40de8e3.tar.bz2 prometheus_node_collector-ec6214138801d80045e916479a1f8297c40de8e3.tar.xz prometheus_node_collector-ec6214138801d80045e916479a1f8297c40de8e3.zip |
Fix num cpu (#1561)
* add a map of profilers to CPUids
`runtime.NumCPU()` returns the number of CPUs that the process can run
on. This number does not necessarily correlate to CPU ids if the
affinity mask of the process is set.
This change keeps the current behavior as the default, but also allows
the user to specify a list or range of CPU ids to use instead.
The CPU id is stored as the value of a map keyed on the profiler
object's address.
Signed-off-by: Joe Damato <jdamato@fastly.com>
Signed-off-by: Daniel Hodges <hodges.daniel.scott@gmail.com>
Signed-off-by: Daniel Hodges <hodges@uber.com>
Co-authored-by: jdamato-fsly <55214354+jdamato-fsly@users.noreply.github.com>
Diffstat (limited to 'collector')
-rw-r--r-- | collector/perf_linux.go | 165 | ||||
-rw-r--r-- | collector/perf_linux_test.go | 73 |
2 files changed, 202 insertions, 36 deletions
diff --git a/collector/perf_linux.go b/collector/perf_linux.go index e8a52b4..b67f970 100644 --- a/collector/perf_linux.go +++ b/collector/perf_linux.go | |||
@@ -14,18 +14,25 @@ | |||
14 | package collector | 14 | package collector |
15 | 15 | ||
16 | import ( | 16 | import ( |
17 | "fmt" | ||
17 | "runtime" | 18 | "runtime" |
18 | "strconv" | 19 | "strconv" |
20 | "strings" | ||
19 | 21 | ||
20 | "github.com/go-kit/kit/log" | 22 | "github.com/go-kit/kit/log" |
21 | "github.com/hodgesds/perf-utils" | 23 | "github.com/hodgesds/perf-utils" |
22 | "github.com/prometheus/client_golang/prometheus" | 24 | "github.com/prometheus/client_golang/prometheus" |
25 | kingpin "gopkg.in/alecthomas/kingpin.v2" | ||
23 | ) | 26 | ) |
24 | 27 | ||
25 | const ( | 28 | const ( |
26 | perfSubsystem = "perf" | 29 | perfSubsystem = "perf" |
27 | ) | 30 | ) |
28 | 31 | ||
32 | var ( | ||
33 | perfCPUsFlag = kingpin.Flag("collector.perf.cpus", "List of CPUs from which perf metrics should be collected").Default("").String() | ||
34 | ) | ||
35 | |||
29 | func init() { | 36 | func init() { |
30 | registerCollector(perfSubsystem, defaultDisabled, NewPerfCollector) | 37 | registerCollector(perfSubsystem, defaultDisabled, NewPerfCollector) |
31 | } | 38 | } |
@@ -36,40 +43,123 @@ func init() { | |||
36 | // settings not all profiler values may be exposed on the target system at any | 43 | // settings not all profiler values may be exposed on the target system at any |
37 | // given time. | 44 | // given time. |
38 | type perfCollector struct { | 45 | type perfCollector struct { |
39 | perfHwProfilers map[int]perf.HardwareProfiler | 46 | hwProfilerCPUMap map[*perf.HardwareProfiler]int |
40 | perfSwProfilers map[int]perf.SoftwareProfiler | 47 | swProfilerCPUMap map[*perf.SoftwareProfiler]int |
41 | perfCacheProfilers map[int]perf.CacheProfiler | 48 | cacheProfilerCPUMap map[*perf.CacheProfiler]int |
42 | desc map[string]*prometheus.Desc | 49 | perfHwProfilers map[int]*perf.HardwareProfiler |
43 | logger log.Logger | 50 | perfSwProfilers map[int]*perf.SoftwareProfiler |
51 | perfCacheProfilers map[int]*perf.CacheProfiler | ||
52 | desc map[string]*prometheus.Desc | ||
53 | logger log.Logger | ||
54 | } | ||
55 | |||
// perfCPUFlagToCPUs parses the value of the collector.perf.cpus flag into the
// list of CPU ids the perf collectors should monitor.
//
// The flag is a comma separated list whose elements are either a single CPU
// id ("3"), an inclusive range ("0-7"), or an inclusive range with a stride
// ("0-10:5", which yields 0,5,10). Whitespace around an element is ignored.
// CPU ids are returned in the order they appear in the flag; duplicates are
// not removed.
//
// An error is returned when an element is not a valid number or range, when
// a stride is not a positive integer, or when a range ends before it starts.
func perfCPUFlagToCPUs(cpuFlag string) ([]int, error) {
	cpus := []int{}
	for _, subset := range strings.Split(cpuFlag, ",") {
		subset = strings.TrimSpace(subset)

		// An element without a dash is a single CPU id.
		if !strings.Contains(subset, "-") {
			cpu, err := strconv.Atoi(subset)
			if err != nil {
				return nil, fmt.Errorf("invalid CPU %q in flag value %q: %w", subset, cpuFlag, err)
			}
			cpus = append(cpus, cpu)
			continue
		}

		// Split off the optional ":stride" suffix of a range.
		stride := 1
		strideSet := strings.Split(subset, ":")
		switch len(strideSet) {
		case 1:
			// No stride given, keep the default of 1.
		case 2:
			s, err := strconv.Atoi(strideSet[1])
			if err != nil {
				return nil, fmt.Errorf("invalid stride %q in flag value %q: %w", strideSet[1], cpuFlag, err)
			}
			// A zero or negative stride would make the loop below never
			// terminate, so it has to be rejected up front.
			if s < 1 {
				return nil, fmt.Errorf("invalid stride %d in flag value %q: must be positive", s, cpuFlag)
			}
			stride = s
		default:
			return nil, fmt.Errorf("invalid flag value %q", cpuFlag)
		}

		rangeSet := strings.Split(strideSet[0], "-")
		if len(rangeSet) != 2 {
			return nil, fmt.Errorf("invalid flag value %q", cpuFlag)
		}
		start, err := strconv.Atoi(rangeSet[0])
		if err != nil {
			return nil, fmt.Errorf("invalid range start %q in flag value %q: %w", rangeSet[0], cpuFlag, err)
		}
		end, err := strconv.Atoi(rangeSet[1])
		if err != nil {
			return nil, fmt.Errorf("invalid range end %q in flag value %q: %w", rangeSet[1], cpuFlag, err)
		}
		// A reversed range would otherwise silently select no CPUs at all.
		if start > end {
			return nil, fmt.Errorf("invalid range %q in flag value %q: start is greater than end", strideSet[0], cpuFlag)
		}
		for i := start; i <= end; i += stride {
			cpus = append(cpus, i)
		}
	}

	return cpus, nil
}
45 | 100 | ||
46 | // NewPerfCollector returns a new perf based collector, it creates a profiler | 101 | // NewPerfCollector returns a new perf based collector, it creates a profiler |
47 | // per CPU. | 102 | // per CPU. |
48 | func NewPerfCollector(logger log.Logger) (Collector, error) { | 103 | func NewPerfCollector(logger log.Logger) (Collector, error) { |
49 | c := &perfCollector{ | 104 | collector := &perfCollector{ |
50 | perfHwProfilers: map[int]perf.HardwareProfiler{}, | 105 | perfHwProfilers: map[int]*perf.HardwareProfiler{}, |
51 | perfSwProfilers: map[int]perf.SoftwareProfiler{}, | 106 | perfSwProfilers: map[int]*perf.SoftwareProfiler{}, |
52 | perfCacheProfilers: map[int]perf.CacheProfiler{}, | 107 | perfCacheProfilers: map[int]*perf.CacheProfiler{}, |
53 | logger: logger, | 108 | hwProfilerCPUMap: map[*perf.HardwareProfiler]int{}, |
109 | swProfilerCPUMap: map[*perf.SoftwareProfiler]int{}, | ||
110 | cacheProfilerCPUMap: map[*perf.CacheProfiler]int{}, | ||
111 | logger: logger, | ||
54 | } | 112 | } |
55 | ncpus := runtime.NumCPU() | 113 | |
56 | for i := 0; i < ncpus; i++ { | 114 | if perfCPUsFlag != nil && *perfCPUsFlag != "" { |
57 | // Use -1 to profile all processes on the CPU, see: | 115 | cpus, err := perfCPUFlagToCPUs(*perfCPUsFlag) |
58 | // man perf_event_open | 116 | if err != nil { |
59 | c.perfHwProfilers[i] = perf.NewHardwareProfiler(-1, i) | 117 | return nil, err |
60 | if err := c.perfHwProfilers[i].Start(); err != nil { | 118 | } |
61 | return c, err | 119 | for _, cpu := range cpus { |
62 | } | 120 | // Use -1 to profile all processes on the CPU, see: |
63 | c.perfSwProfilers[i] = perf.NewSoftwareProfiler(-1, i) | 121 | // man perf_event_open |
64 | if err := c.perfSwProfilers[i].Start(); err != nil { | 122 | hwProf := perf.NewHardwareProfiler(-1, cpu) |
65 | return c, err | 123 | if err := hwProf.Start(); err != nil { |
66 | } | 124 | return nil, err |
67 | c.perfCacheProfilers[i] = perf.NewCacheProfiler(-1, i) | 125 | } |
68 | if err := c.perfCacheProfilers[i].Start(); err != nil { | 126 | collector.perfHwProfilers[cpu] = &hwProf |
69 | return c, err | 127 | |
128 | swProf := perf.NewSoftwareProfiler(-1, cpu) | ||
129 | if err := swProf.Start(); err != nil { | ||
130 | return nil, err | ||
131 | } | ||
132 | collector.perfSwProfilers[cpu] = &swProf | ||
133 | |||
134 | cacheProf := perf.NewCacheProfiler(-1, cpu) | ||
135 | if err := cacheProf.Start(); err != nil { | ||
136 | return nil, err | ||
137 | } | ||
138 | collector.perfCacheProfilers[cpu] = &cacheProf | ||
139 | } | ||
140 | } else { | ||
141 | for i := 0; i < runtime.NumCPU(); i++ { | ||
142 | hwProf := perf.NewHardwareProfiler(-1, i) | ||
143 | if err := hwProf.Start(); err != nil { | ||
144 | return nil, err | ||
145 | } | ||
146 | collector.perfHwProfilers[i] = &hwProf | ||
147 | |||
148 | swProf := perf.NewSoftwareProfiler(-1, i) | ||
149 | if err := swProf.Start(); err != nil { | ||
150 | return nil, err | ||
151 | } | ||
152 | collector.perfSwProfilers[i] = &swProf | ||
153 | |||
154 | cacheProf := perf.NewCacheProfiler(-1, i) | ||
155 | if err := cacheProf.Start(); err != nil { | ||
156 | return nil, err | ||
157 | } | ||
158 | collector.perfCacheProfilers[i] = &cacheProf | ||
70 | } | 159 | } |
71 | } | 160 | } |
72 | c.desc = map[string]*prometheus.Desc{ | 161 | |
162 | collector.desc = map[string]*prometheus.Desc{ | ||
73 | "cpucycles_total": prometheus.NewDesc( | 163 | "cpucycles_total": prometheus.NewDesc( |
74 | prometheus.BuildFQName( | 164 | prometheus.BuildFQName( |
75 | namespace, | 165 | namespace, |
@@ -312,7 +402,7 @@ func NewPerfCollector(logger log.Logger) (Collector, error) { | |||
312 | ), | 402 | ), |
313 | } | 403 | } |
314 | 404 | ||
315 | return c, nil | 405 | return collector, nil |
316 | } | 406 | } |
317 | 407 | ||
318 | // Update implements the Collector interface and will collect metrics per CPU. | 408 | // Update implements the Collector interface and will collect metrics per CPU. |
@@ -333,9 +423,10 @@ func (c *perfCollector) Update(ch chan<- prometheus.Metric) error { | |||
333 | } | 423 | } |
334 | 424 | ||
335 | func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error { | 425 | func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error { |
336 | for cpu, profiler := range c.perfHwProfilers { | 426 | for _, profiler := range c.perfHwProfilers { |
337 | cpuStr := strconv.Itoa(cpu) | 427 | cpuid := c.hwProfilerCPUMap[profiler] |
338 | hwProfile, err := profiler.Profile() | 428 | cpuStr := fmt.Sprintf("%d", cpuid) |
429 | hwProfile, err := (*profiler).Profile() | ||
339 | if err != nil { | 430 | if err != nil { |
340 | return err | 431 | return err |
341 | } | 432 | } |
@@ -404,9 +495,10 @@ func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error { | |||
404 | } | 495 | } |
405 | 496 | ||
406 | func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error { | 497 | func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error { |
407 | for cpu, profiler := range c.perfSwProfilers { | 498 | for _, profiler := range c.perfSwProfilers { |
408 | cpuStr := strconv.Itoa(cpu) | 499 | cpuid := c.swProfilerCPUMap[profiler] |
409 | swProfile, err := profiler.Profile() | 500 | cpuStr := fmt.Sprintf("%d", cpuid) |
501 | swProfile, err := (*profiler).Profile() | ||
410 | if err != nil { | 502 | if err != nil { |
411 | return err | 503 | return err |
412 | } | 504 | } |
@@ -459,9 +551,10 @@ func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error { | |||
459 | } | 551 | } |
460 | 552 | ||
461 | func (c *perfCollector) updateCacheStats(ch chan<- prometheus.Metric) error { | 553 | func (c *perfCollector) updateCacheStats(ch chan<- prometheus.Metric) error { |
462 | for cpu, profiler := range c.perfCacheProfilers { | 554 | for _, profiler := range c.perfCacheProfilers { |
463 | cpuStr := strconv.Itoa(cpu) | 555 | cpuid := c.cacheProfilerCPUMap[profiler] |
464 | cacheProfile, err := profiler.Profile() | 556 | cpuStr := fmt.Sprintf("%d", cpuid) |
557 | cacheProfile, err := (*profiler).Profile() | ||
465 | if err != nil { | 558 | if err != nil { |
466 | return err | 559 | return err |
467 | } | 560 | } |
diff --git a/collector/perf_linux_test.go b/collector/perf_linux_test.go index 68c580b..eecfab9 100644 --- a/collector/perf_linux_test.go +++ b/collector/perf_linux_test.go | |||
@@ -54,3 +54,76 @@ func TestPerfCollector(t *testing.T) { | |||
54 | t.Fatal(err) | 54 | t.Fatal(err) |
55 | } | 55 | } |
56 | } | 56 | } |
57 | |||
58 | func TestPerfCPUFlagToCPUs(t *testing.T) { | ||
59 | tests := []struct { | ||
60 | name string | ||
61 | flag string | ||
62 | exCpus []int | ||
63 | errStr string | ||
64 | }{ | ||
65 | { | ||
66 | name: "valid single cpu", | ||
67 | flag: "1", | ||
68 | exCpus: []int{1}, | ||
69 | }, | ||
70 | { | ||
71 | name: "valid range cpus", | ||
72 | flag: "1-5", | ||
73 | exCpus: []int{1, 2, 3, 4, 5}, | ||
74 | }, | ||
75 | { | ||
76 | name: "valid double digit", | ||
77 | flag: "10", | ||
78 | exCpus: []int{10}, | ||
79 | }, | ||
80 | { | ||
81 | name: "valid double digit range", | ||
82 | flag: "10-12", | ||
83 | exCpus: []int{10, 11, 12}, | ||
84 | }, | ||
85 | { | ||
86 | name: "valid double digit stride", | ||
87 | flag: "10-20:5", | ||
88 | exCpus: []int{10, 15, 20}, | ||
89 | }, | ||
90 | } | ||
91 | |||
92 | for _, test := range tests { | ||
93 | t.Run(test.name, func(t *testing.T) { | ||
94 | cpus, err := perfCPUFlagToCPUs(test.flag) | ||
95 | if test.errStr != "" { | ||
96 | if err != nil { | ||
97 | t.Fatal("expected error to not be nil") | ||
98 | } | ||
99 | if test.errStr != err.Error() { | ||
100 | t.Fatalf( | ||
101 | "expected error %q, got %q", | ||
102 | test.errStr, | ||
103 | err.Error(), | ||
104 | ) | ||
105 | } | ||
106 | return | ||
107 | } | ||
108 | if err != nil { | ||
109 | t.Fatal(err) | ||
110 | } | ||
111 | if len(cpus) != len(test.exCpus) { | ||
112 | t.Fatalf( | ||
113 | "expected cpus %v, got %v", | ||
114 | test.exCpus, | ||
115 | cpus, | ||
116 | ) | ||
117 | } | ||
118 | for i := range cpus { | ||
119 | if test.exCpus[i] != cpus[i] { | ||
120 | t.Fatalf( | ||
121 | "expected cpus %v, got %v", | ||
122 | test.exCpus, | ||
123 | cpus, | ||
124 | ) | ||
125 | } | ||
126 | } | ||
127 | }) | ||
128 | } | ||
129 | } | ||