aboutsummaryrefslogtreecommitdiff
path: root/collector
diff options
context:
space:
mode:
authorDaniel Hodges <hodges@uber.com>2020-02-20 05:36:33 -0500
committerGitHub <noreply@github.com>2020-02-20 11:36:33 +0100
commitec6214138801d80045e916479a1f8297c40de8e3 (patch)
treec71787ad5060284be50ebce5e6445d587304ddeb /collector
parentb40954dce598577413d93ce32cc005b57c6371bb (diff)
downloadprometheus_node_collector-ec6214138801d80045e916479a1f8297c40de8e3.tar.bz2
prometheus_node_collector-ec6214138801d80045e916479a1f8297c40de8e3.tar.xz
prometheus_node_collector-ec6214138801d80045e916479a1f8297c40de8e3.zip
Fix num cpu (#1561)
* add a map of profilers to CPUids

`runtime.NumCPU()` returns the number of CPUs that the process can run on. This number does not necessarily correlate to CPU ids if the affinity mask of the process is set.

This change maintains the current behavior as default, but also allows the user to specify a range of CPUids to use instead.

The CPU id is stored as the value of a map keyed on the profiler object's address.

Signed-off-by: Joe Damato <jdamato@fastly.com>
Signed-off-by: Daniel Hodges <hodges.daniel.scott@gmail.com>
Signed-off-by: Daniel Hodges <hodges@uber.com>
Co-authored-by: jdamato-fsly <55214354+jdamato-fsly@users.noreply.github.com>
Diffstat (limited to 'collector')
-rw-r--r--collector/perf_linux.go165
-rw-r--r--collector/perf_linux_test.go73
2 files changed, 202 insertions, 36 deletions
diff --git a/collector/perf_linux.go b/collector/perf_linux.go
index e8a52b4..b67f970 100644
--- a/collector/perf_linux.go
+++ b/collector/perf_linux.go
@@ -14,18 +14,25 @@
14package collector 14package collector
15 15
16import ( 16import (
17 "fmt"
17 "runtime" 18 "runtime"
18 "strconv" 19 "strconv"
20 "strings"
19 21
20 "github.com/go-kit/kit/log" 22 "github.com/go-kit/kit/log"
21 "github.com/hodgesds/perf-utils" 23 "github.com/hodgesds/perf-utils"
22 "github.com/prometheus/client_golang/prometheus" 24 "github.com/prometheus/client_golang/prometheus"
25 kingpin "gopkg.in/alecthomas/kingpin.v2"
23) 26)
24 27
25const ( 28const (
26 perfSubsystem = "perf" 29 perfSubsystem = "perf"
27) 30)
28 31
32var (
33 perfCPUsFlag = kingpin.Flag("collector.perf.cpus", "List of CPUs from which perf metrics should be collected").Default("").String()
34)
35
29func init() { 36func init() {
30 registerCollector(perfSubsystem, defaultDisabled, NewPerfCollector) 37 registerCollector(perfSubsystem, defaultDisabled, NewPerfCollector)
31} 38}
@@ -36,40 +43,123 @@ func init() {
36// settings not all profiler values may be exposed on the target system at any 43// settings not all profiler values may be exposed on the target system at any
37// given time. 44// given time.
38type perfCollector struct { 45type perfCollector struct {
39 perfHwProfilers map[int]perf.HardwareProfiler 46 hwProfilerCPUMap map[*perf.HardwareProfiler]int
40 perfSwProfilers map[int]perf.SoftwareProfiler 47 swProfilerCPUMap map[*perf.SoftwareProfiler]int
41 perfCacheProfilers map[int]perf.CacheProfiler 48 cacheProfilerCPUMap map[*perf.CacheProfiler]int
42 desc map[string]*prometheus.Desc 49 perfHwProfilers map[int]*perf.HardwareProfiler
43 logger log.Logger 50 perfSwProfilers map[int]*perf.SoftwareProfiler
51 perfCacheProfilers map[int]*perf.CacheProfiler
52 desc map[string]*prometheus.Desc
53 logger log.Logger
54}
55
// perfCPUFlagToCPUs parses the value of the collector.perf.cpus flag into the
// list of CPU ids the perf collectors should monitor.
//
// The flag is a comma separated list whose elements are one of:
//
//	"3"       a single CPU id
//	"0-3"     an inclusive range of CPU ids
//	"0-10:5"  an inclusive range walked with a stride (yields 0, 5, 10)
//
// CPU ids are returned in the order they appear in the flag and duplicates are
// not removed. An error is returned when an element is not a valid number or
// range, when a range is reversed, or when a stride is smaller than 1.
func perfCPUFlagToCPUs(cpuFlag string) ([]int, error) {
	cpus := []int{}
	for _, subset := range strings.Split(cpuFlag, ",") {
		// An element without a dash is a single CPU id.
		if !strings.Contains(subset, "-") {
			cpu, err := strconv.Atoi(subset)
			if err != nil {
				return nil, fmt.Errorf("invalid CPU %q in flag value %q: %w", subset, cpuFlag, err)
			}
			cpus = append(cpus, cpu)
			continue
		}

		// Split off the optional ":stride" suffix, e.g. 0-10:5 yields 0,5,10.
		stride := 1
		strideSet := strings.Split(subset, ":")
		switch len(strideSet) {
		case 1:
			// No stride given, every CPU in the range is used.
		case 2:
			s, err := strconv.Atoi(strideSet[1])
			if err != nil {
				return nil, fmt.Errorf("invalid stride %q in flag value %q: %w", strideSet[1], cpuFlag, err)
			}
			// A stride of zero (or less) would never advance past the end
			// of the range and the loop below would not terminate.
			if s < 1 {
				return nil, fmt.Errorf("invalid stride %d in flag value %q: must be at least 1", s, cpuFlag)
			}
			stride = s
		default:
			// More than one ':' is ambiguous; refuse instead of ignoring it.
			return nil, fmt.Errorf("invalid flag value %q", cpuFlag)
		}

		rangeSet := strings.Split(strideSet[0], "-")
		if len(rangeSet) != 2 {
			return nil, fmt.Errorf("invalid flag value %q", cpuFlag)
		}
		start, err := strconv.Atoi(rangeSet[0])
		if err != nil {
			return nil, fmt.Errorf("invalid range start %q in flag value %q: %w", rangeSet[0], cpuFlag, err)
		}
		end, err := strconv.Atoi(rangeSet[1])
		if err != nil {
			return nil, fmt.Errorf("invalid range end %q in flag value %q: %w", rangeSet[1], cpuFlag, err)
		}
		// A reversed range would silently select no CPUs at all.
		if start > end {
			return nil, fmt.Errorf("invalid range %q in flag value %q: start is greater than end", strideSet[0], cpuFlag)
		}
		for i := start; i <= end; i += stride {
			cpus = append(cpus, i)
		}
	}

	return cpus, nil
}
45 100
46// NewPerfCollector returns a new perf based collector, it creates a profiler 101// NewPerfCollector returns a new perf based collector, it creates a profiler
47// per CPU. 102// per CPU.
48func NewPerfCollector(logger log.Logger) (Collector, error) { 103func NewPerfCollector(logger log.Logger) (Collector, error) {
49 c := &perfCollector{ 104 collector := &perfCollector{
50 perfHwProfilers: map[int]perf.HardwareProfiler{}, 105 perfHwProfilers: map[int]*perf.HardwareProfiler{},
51 perfSwProfilers: map[int]perf.SoftwareProfiler{}, 106 perfSwProfilers: map[int]*perf.SoftwareProfiler{},
52 perfCacheProfilers: map[int]perf.CacheProfiler{}, 107 perfCacheProfilers: map[int]*perf.CacheProfiler{},
53 logger: logger, 108 hwProfilerCPUMap: map[*perf.HardwareProfiler]int{},
109 swProfilerCPUMap: map[*perf.SoftwareProfiler]int{},
110 cacheProfilerCPUMap: map[*perf.CacheProfiler]int{},
111 logger: logger,
54 } 112 }
55 ncpus := runtime.NumCPU() 113
56 for i := 0; i < ncpus; i++ { 114 if perfCPUsFlag != nil && *perfCPUsFlag != "" {
57 // Use -1 to profile all processes on the CPU, see: 115 cpus, err := perfCPUFlagToCPUs(*perfCPUsFlag)
58 // man perf_event_open 116 if err != nil {
59 c.perfHwProfilers[i] = perf.NewHardwareProfiler(-1, i) 117 return nil, err
60 if err := c.perfHwProfilers[i].Start(); err != nil { 118 }
61 return c, err 119 for _, cpu := range cpus {
62 } 120 // Use -1 to profile all processes on the CPU, see:
63 c.perfSwProfilers[i] = perf.NewSoftwareProfiler(-1, i) 121 // man perf_event_open
64 if err := c.perfSwProfilers[i].Start(); err != nil { 122 hwProf := perf.NewHardwareProfiler(-1, cpu)
65 return c, err 123 if err := hwProf.Start(); err != nil {
66 } 124 return nil, err
67 c.perfCacheProfilers[i] = perf.NewCacheProfiler(-1, i) 125 }
68 if err := c.perfCacheProfilers[i].Start(); err != nil { 126 collector.perfHwProfilers[cpu] = &hwProf
69 return c, err 127
128 swProf := perf.NewSoftwareProfiler(-1, cpu)
129 if err := swProf.Start(); err != nil {
130 return nil, err
131 }
132 collector.perfSwProfilers[cpu] = &swProf
133
134 cacheProf := perf.NewCacheProfiler(-1, cpu)
135 if err := cacheProf.Start(); err != nil {
136 return nil, err
137 }
138 collector.perfCacheProfilers[cpu] = &cacheProf
139 }
140 } else {
141 for i := 0; i < runtime.NumCPU(); i++ {
142 hwProf := perf.NewHardwareProfiler(-1, i)
143 if err := hwProf.Start(); err != nil {
144 return nil, err
145 }
146 collector.perfHwProfilers[i] = &hwProf
147
148 swProf := perf.NewSoftwareProfiler(-1, i)
149 if err := swProf.Start(); err != nil {
150 return nil, err
151 }
152 collector.perfSwProfilers[i] = &swProf
153
154 cacheProf := perf.NewCacheProfiler(-1, i)
155 if err := cacheProf.Start(); err != nil {
156 return nil, err
157 }
158 collector.perfCacheProfilers[i] = &cacheProf
70 } 159 }
71 } 160 }
72 c.desc = map[string]*prometheus.Desc{ 161
162 collector.desc = map[string]*prometheus.Desc{
73 "cpucycles_total": prometheus.NewDesc( 163 "cpucycles_total": prometheus.NewDesc(
74 prometheus.BuildFQName( 164 prometheus.BuildFQName(
75 namespace, 165 namespace,
@@ -312,7 +402,7 @@ func NewPerfCollector(logger log.Logger) (Collector, error) {
312 ), 402 ),
313 } 403 }
314 404
315 return c, nil 405 return collector, nil
316} 406}
317 407
318// Update implements the Collector interface and will collect metrics per CPU. 408// Update implements the Collector interface and will collect metrics per CPU.
@@ -333,9 +423,10 @@ func (c *perfCollector) Update(ch chan<- prometheus.Metric) error {
333} 423}
334 424
335func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error { 425func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error {
336 for cpu, profiler := range c.perfHwProfilers { 426 for _, profiler := range c.perfHwProfilers {
337 cpuStr := strconv.Itoa(cpu) 427 cpuid := c.hwProfilerCPUMap[profiler]
338 hwProfile, err := profiler.Profile() 428 cpuStr := fmt.Sprintf("%d", cpuid)
429 hwProfile, err := (*profiler).Profile()
339 if err != nil { 430 if err != nil {
340 return err 431 return err
341 } 432 }
@@ -404,9 +495,10 @@ func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error {
404} 495}
405 496
406func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error { 497func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error {
407 for cpu, profiler := range c.perfSwProfilers { 498 for _, profiler := range c.perfSwProfilers {
408 cpuStr := strconv.Itoa(cpu) 499 cpuid := c.swProfilerCPUMap[profiler]
409 swProfile, err := profiler.Profile() 500 cpuStr := fmt.Sprintf("%d", cpuid)
501 swProfile, err := (*profiler).Profile()
410 if err != nil { 502 if err != nil {
411 return err 503 return err
412 } 504 }
@@ -459,9 +551,10 @@ func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error {
459} 551}
460 552
461func (c *perfCollector) updateCacheStats(ch chan<- prometheus.Metric) error { 553func (c *perfCollector) updateCacheStats(ch chan<- prometheus.Metric) error {
462 for cpu, profiler := range c.perfCacheProfilers { 554 for _, profiler := range c.perfCacheProfilers {
463 cpuStr := strconv.Itoa(cpu) 555 cpuid := c.cacheProfilerCPUMap[profiler]
464 cacheProfile, err := profiler.Profile() 556 cpuStr := fmt.Sprintf("%d", cpuid)
557 cacheProfile, err := (*profiler).Profile()
465 if err != nil { 558 if err != nil {
466 return err 559 return err
467 } 560 }
diff --git a/collector/perf_linux_test.go b/collector/perf_linux_test.go
index 68c580b..eecfab9 100644
--- a/collector/perf_linux_test.go
+++ b/collector/perf_linux_test.go
@@ -54,3 +54,76 @@ func TestPerfCollector(t *testing.T) {
54 t.Fatal(err) 54 t.Fatal(err)
55 } 55 }
56} 56}
57
58func TestPerfCPUFlagToCPUs(t *testing.T) {
59 tests := []struct {
60 name string
61 flag string
62 exCpus []int
63 errStr string
64 }{
65 {
66 name: "valid single cpu",
67 flag: "1",
68 exCpus: []int{1},
69 },
70 {
71 name: "valid range cpus",
72 flag: "1-5",
73 exCpus: []int{1, 2, 3, 4, 5},
74 },
75 {
76 name: "valid double digit",
77 flag: "10",
78 exCpus: []int{10},
79 },
80 {
81 name: "valid double digit range",
82 flag: "10-12",
83 exCpus: []int{10, 11, 12},
84 },
85 {
86 name: "valid double digit stride",
87 flag: "10-20:5",
88 exCpus: []int{10, 15, 20},
89 },
90 }
91
92 for _, test := range tests {
93 t.Run(test.name, func(t *testing.T) {
94 cpus, err := perfCPUFlagToCPUs(test.flag)
95 if test.errStr != "" {
96 if err != nil {
97 t.Fatal("expected error to not be nil")
98 }
99 if test.errStr != err.Error() {
100 t.Fatalf(
101 "expected error %q, got %q",
102 test.errStr,
103 err.Error(),
104 )
105 }
106 return
107 }
108 if err != nil {
109 t.Fatal(err)
110 }
111 if len(cpus) != len(test.exCpus) {
112 t.Fatalf(
113 "expected cpus %v, got %v",
114 test.exCpus,
115 cpus,
116 )
117 }
118 for i := range cpus {
119 if test.exCpus[i] != cpus[i] {
120 t.Fatalf(
121 "expected cpus %v, got %v",
122 test.exCpus,
123 cpus,
124 )
125 }
126 }
127 })
128 }
129}