diff options
author | beorn7 <beorn@grafana.com> | 2019-07-05 19:38:03 +0200 |
---|---|---|
committer | beorn7 <beorn@grafana.com> | 2019-07-05 19:38:03 +0200 |
commit | 2df034c05512628fc1946f5031773790b644abfc (patch) | |
tree | 89aba5d892678984d434cdb6366c27df9935c1a8 /docs | |
parent | 61bcc5b4681230e07b96dbb49a9a7f5e301062bf (diff) | |
download | prometheus_node_collector-2df034c05512628fc1946f5031773790b644abfc.tar.bz2 prometheus_node_collector-2df034c05512628fc1946f5031773790b644abfc.tar.xz prometheus_node_collector-2df034c05512628fc1946f5031773790b644abfc.zip |
Move node-mixin into docs directory
Signed-off-by: beorn7 <beorn@grafana.com>
Diffstat (limited to 'docs')
-rw-r--r-- | docs/node-mixin/.gitignore | 3 | ||||
-rw-r--r-- | docs/node-mixin/alerts/alerts.libsonnet | 165 | ||||
-rw-r--r-- | docs/node-mixin/config.libsonnet | 11 | ||||
-rw-r--r-- | docs/node-mixin/dashboards/dashboards.libsonnet | 2 | ||||
-rw-r--r-- | docs/node-mixin/dashboards/node.libsonnet | 170 | ||||
-rw-r--r-- | docs/node-mixin/dashboards/use.libsonnet | 151 | ||||
-rw-r--r-- | docs/node-mixin/jsonnetfile.json | 24 | ||||
-rw-r--r-- | docs/node-mixin/lib/promgrafonnet/gauge.libsonnet | 60 | ||||
-rw-r--r-- | docs/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet | 48 | ||||
-rw-r--r-- | docs/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet | 5 | ||||
-rw-r--r-- | docs/node-mixin/mixin.libsonnet | 4 | ||||
-rw-r--r-- | docs/node-mixin/rules/rules.libsonnet | 106 |
12 files changed, 749 insertions, 0 deletions
diff --git a/docs/node-mixin/.gitignore b/docs/node-mixin/.gitignore new file mode 100644 index 0000000..65d141b --- /dev/null +++ b/docs/node-mixin/.gitignore | |||
@@ -0,0 +1,3 @@ | |||
1 | /jsonnetfile.lock.json | ||
2 | /vendor/ | ||
3 | |||
diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet new file mode 100644 index 0000000..8ea70cc --- /dev/null +++ b/docs/node-mixin/alerts/alerts.libsonnet | |||
@@ -0,0 +1,165 @@ | |||
1 | { | ||
2 | prometheusAlerts+:: { | ||
3 | groups+: [ | ||
4 | { | ||
5 | name: 'node-exporter', | ||
6 | rules: [ | ||
7 | { | ||
8 | alert: 'NodeFilesystemSpaceFillingUp', | ||
9 | expr: ||| | ||
10 | predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 | ||
11 | and | ||
12 | node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 | ||
13 | and | ||
14 | node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 | ||
15 | ||| % $._config, | ||
16 | 'for': '1h', | ||
17 | labels: { | ||
18 | severity: 'warning', | ||
19 | }, | ||
20 | annotations: { | ||
21 | message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of space within the next 24 hours.', | ||
22 | }, | ||
23 | }, | ||
24 | { | ||
25 | alert: 'NodeFilesystemSpaceFillingUp', | ||
26 | expr: ||| | ||
27 | predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 | ||
28 | and | ||
29 | node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 | ||
30 | and | ||
31 | node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 | ||
32 | ||| % $._config, | ||
33 | 'for': '1h', | ||
34 | labels: { | ||
35 | severity: 'critical', | ||
36 | }, | ||
37 | annotations: { | ||
38 | message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of space within the next 4 hours.', | ||
39 | }, | ||
40 | }, | ||
41 | { | ||
42 | alert: 'NodeFilesystemOutOfSpace', | ||
43 | expr: ||| | ||
44 | node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 | ||
45 | and | ||
46 | node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 | ||
47 | ||| % $._config, | ||
48 | 'for': '1h', | ||
49 | labels: { | ||
50 | severity: 'warning', | ||
51 | }, | ||
52 | annotations: { | ||
53 | message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available space left.', | ||
54 | }, | ||
55 | }, | ||
56 | { | ||
57 | alert: 'NodeFilesystemOutOfSpace', | ||
58 | expr: ||| | ||
59 | node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 | ||
60 | and | ||
61 | node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 | ||
62 | ||| % $._config, | ||
63 | 'for': '1h', | ||
64 | labels: { | ||
65 | severity: 'critical', | ||
66 | }, | ||
67 | annotations: { | ||
68 | message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available space left.', | ||
69 | }, | ||
70 | }, | ||
71 | { | ||
72 | alert: 'NodeFilesystemFilesFillingUp', | ||
73 | expr: ||| | ||
74 | predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 | ||
75 | and | ||
76 | node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 | ||
77 | and | ||
78 | node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 | ||
79 | ||| % $._config, | ||
80 | 'for': '1h', | ||
81 | labels: { | ||
82 | severity: 'warning', | ||
83 | }, | ||
84 | annotations: { | ||
85 | message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of files within the next 24 hours.', | ||
86 | }, | ||
87 | }, | ||
88 | { | ||
89 | alert: 'NodeFilesystemFilesFillingUp', | ||
90 | expr: ||| | ||
91 | predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 | ||
92 | and | ||
93 | node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 | ||
94 | and | ||
95 | node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 | ||
96 | ||| % $._config, | ||
97 | 'for': '1h', | ||
98 | labels: { | ||
99 | severity: 'critical', | ||
100 | }, | ||
101 | annotations: { | ||
102 | message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of files within the next 4 hours.', | ||
103 | }, | ||
104 | }, | ||
105 | { | ||
106 | alert: 'NodeFilesystemOutOfFiles', | ||
107 | expr: ||| | ||
108 | node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 | ||
109 | and | ||
110 | node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 | ||
111 | ||| % $._config, | ||
112 | 'for': '1h', | ||
113 | labels: { | ||
114 | severity: 'warning', | ||
115 | }, | ||
116 | annotations: { | ||
117 | message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available inodes left.', | ||
118 | }, | ||
119 | }, | ||
120 | { // Critical counterpart of the 5% inode warning above: fires below 3% free inodes on writable filesystems. | ||
121 | alert: 'NodeFilesystemOutOfFiles', | ||
122 | expr: ||| | ||
123 | node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 | ||
124 | and | ||
125 | node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 | ||
126 | ||| % $._config, | ||
127 | 'for': '1h', | ||
128 | labels: { | ||
129 | severity: 'critical', | ||
130 | }, | ||
131 | annotations: { | ||
132 | message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available inodes left.', | ||
133 | }, | ||
134 | }, | ||
135 | { | ||
136 | alert: 'NodeNetworkReceiveErrs', | ||
137 | expr: ||| | ||
138 | increase(node_network_receive_errs_total[2m]) > 10 | ||
139 | ||| % $._config, | ||
140 | 'for': '1h', | ||
141 | labels: { | ||
142 | severity: 'critical', | ||
143 | }, | ||
144 | annotations: { | ||
145 | message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while receiving packets ({{ $value }} errors in two minutes).', | ||
146 | }, | ||
147 | }, | ||
148 | { | ||
149 | alert: 'NodeNetworkTransmitErrs', | ||
150 | expr: ||| | ||
151 | increase(node_network_transmit_errs_total[2m]) > 10 | ||
152 | ||| % $._config, | ||
153 | 'for': '1h', | ||
154 | labels: { | ||
155 | severity: 'critical', | ||
156 | }, | ||
157 | annotations: { | ||
158 | message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while transmitting packets ({{ $value }} errors in two minutes).', | ||
159 | }, | ||
160 | }, | ||
161 | ], | ||
162 | }, | ||
163 | ], | ||
164 | }, | ||
165 | } | ||
diff --git a/docs/node-mixin/config.libsonnet b/docs/node-mixin/config.libsonnet new file mode 100644 index 0000000..6c5d6f7 --- /dev/null +++ b/docs/node-mixin/config.libsonnet | |||
@@ -0,0 +1,11 @@ | |||
1 | { | ||
2 | _config+:: { | ||
3 | // Selectors are inserted between {} in Prometheus queries. | ||
4 | nodeExporterSelector: 'job="node-exporter"', | ||
5 | |||
6 | // Mainly extracted because they are repetitive, but also useful to customize. | ||
7 | fsSelectors: 'fstype=~"ext.|xfs",mountpoint!="/var/lib/docker/aufs"', | ||
8 | |||
9 | grafana_prefix: '', | ||
10 | }, | ||
11 | } | ||
diff --git a/docs/node-mixin/dashboards/dashboards.libsonnet b/docs/node-mixin/dashboards/dashboards.libsonnet new file mode 100644 index 0000000..e6adbd4 --- /dev/null +++ b/docs/node-mixin/dashboards/dashboards.libsonnet | |||
@@ -0,0 +1,2 @@ | |||
1 | (import 'node.libsonnet') + | ||
2 | (import 'use.libsonnet') | ||
diff --git a/docs/node-mixin/dashboards/node.libsonnet b/docs/node-mixin/dashboards/node.libsonnet new file mode 100644 index 0000000..4594e3e --- /dev/null +++ b/docs/node-mixin/dashboards/node.libsonnet | |||
@@ -0,0 +1,170 @@ | |||
1 | local grafana = import 'grafonnet/grafana.libsonnet'; | ||
2 | local dashboard = grafana.dashboard; | ||
3 | local row = grafana.row; | ||
4 | local prometheus = grafana.prometheus; | ||
5 | local template = grafana.template; | ||
6 | local graphPanel = grafana.graphPanel; | ||
7 | local promgrafonnet = import '../lib/promgrafonnet/promgrafonnet.libsonnet'; | ||
8 | local gauge = promgrafonnet.gauge; | ||
9 | |||
10 | { | ||
11 | grafanaDashboards+:: { | ||
12 | 'nodes.json': | ||
13 | local idleCPU = // Per-CPU busy fraction (1 - idle rate); the local name is kept because the dashboard row below references it. | ||
14 | graphPanel.new( | ||
15 | 'CPU Usage', | ||
16 | datasource='$datasource', | ||
17 | span=6, | ||
18 | format='percentunit', | ||
19 | max=1,  // percentunit renders a 0-1 ratio as 0-100%, so the axis must top out at 1. | ||
20 | min=0, | ||
21 | ) | ||
22 | .addTarget(prometheus.target( | ||
23 | ||| | ||
24 | 1 - avg by (cpu) (irate(node_cpu{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m])) | ||
25 | ||| % $._config, | ||
26 | legendFormat='{{cpu}}', | ||
27 | intervalFactor=10, | ||
28 | )); | ||
29 | |||
30 | local systemLoad = // 1m/5m/15m load averages for the selected instance. | ||
31 | graphPanel.new( | ||
32 | 'System load', | ||
33 | datasource='$datasource', | ||
34 | span=6, | ||
35 | format='short',  // Load average is a unit-less run-queue length, not a 0-1 ratio. | ||
36 | ) | ||
37 | .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 1m')) | ||
38 | .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 5m')) | ||
39 | .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 15m')); | ||
40 | |||
41 | local memoryGraph = | ||
42 | graphPanel.new( | ||
43 | 'Memory Usage', | ||
44 | datasource='$datasource', | ||
45 | span=9, | ||
46 | format='bytes', | ||
47 | ) | ||
48 | .addTarget(prometheus.target( | ||
49 | ||| | ||
50 | node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} | ||
51 | - node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"} | ||
52 | - node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"} | ||
53 | - node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"} | ||
54 | ||| % $._config, legendFormat='memory used' | ||
55 | )) | ||
56 | .addTarget(prometheus.target('node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers')) | ||
57 | .addTarget(prometheus.target('node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached')) | ||
58 | .addTarget(prometheus.target('node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free')); | ||
59 | |||
60 | local memoryGauge = gauge.new( | ||
61 | 'Memory Usage', | ||
62 | ||| | ||
63 | node_memory_MemAvailable{%(nodeExporterSelector)s, instance="$instance"} | ||
64 | / | ||
65 | node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} | ||
66 | ||| % $._config, | ||
67 | ).withLowerBeingBetter(); | ||
68 | |||
69 | local diskIO = | ||
70 | graphPanel.new( | ||
71 | 'Disk I/O', | ||
72 | datasource='$datasource', | ||
73 | span=9, | ||
74 | ) | ||
75 | .addTarget(prometheus.target('sum by (instance) (irate(node_disk_bytes_read_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='read')) | ||
76 | .addTarget(prometheus.target('sum by (instance) (irate(node_disk_bytes_written_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='written')) | ||
77 | .addTarget(prometheus.target('sum by (instance) (irate(node_disk_io_time_ms{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='io time')) + | ||
78 | { | ||
79 | seriesOverrides: [ | ||
80 | { | ||
81 | alias: 'read', | ||
82 | yaxis: 1, | ||
83 | }, | ||
84 | { | ||
85 | alias: 'io time', | ||
86 | yaxis: 2, | ||
87 | }, | ||
88 | ], | ||
89 | yaxes: [ | ||
90 | self.yaxe(format='bytes'), | ||
91 | self.yaxe(format='ms'), | ||
92 | ], | ||
93 | }; | ||
94 | |||
95 | local diskSpaceUsage = gauge.new(  // Used fraction of all non-rootfs filesystems on the selected instance. | ||
96 | 'Disk Space Usage', | ||
97 | ||| | ||
98 | 1 - ( | ||
99 | sum(node_filesystem_free{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"}) | ||
100 | / | ||
101 | sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"}) | ||
102 | ) | ||
103 | ||| % $._config, | ||
104 | ).withLowerBeingBetter(); | ||
105 | |||
106 | local networkReceived = | ||
107 | graphPanel.new( | ||
108 | 'Network Received', | ||
109 | datasource='$datasource', | ||
110 | span=6, | ||
111 | format='bytes', | ||
112 | ) | ||
113 | .addTarget(prometheus.target('irate(node_network_receive_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[1m])' % $._config, legendFormat='{{device}}')); | ||
114 | |||
115 | local networkTransmitted = | ||
116 | graphPanel.new( | ||
117 | 'Network Transmitted', | ||
118 | datasource='$datasource', | ||
119 | span=6, | ||
120 | format='bytes', | ||
121 | ) | ||
122 | .addTarget(prometheus.target('irate(node_network_transmit_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[1m])' % $._config, legendFormat='{{device}}')); | ||
123 | |||
124 | dashboard.new('Nodes', time_from='now-1h') | ||
125 | .addTemplate( | ||
126 | { | ||
127 | current: { | ||
128 | text: 'Prometheus', | ||
129 | value: 'Prometheus', | ||
130 | }, | ||
131 | hide: 0, | ||
132 | label: null, | ||
133 | name: 'datasource', | ||
134 | options: [], | ||
135 | query: 'prometheus', | ||
136 | refresh: 1, | ||
137 | regex: '', | ||
138 | type: 'datasource', | ||
139 | }, | ||
140 | ) | ||
141 | .addTemplate( | ||
142 | template.new( | ||
143 | 'instance', | ||
144 | '$datasource', | ||
145 | 'label_values(node_boot_time{%(nodeExporterSelector)s}, instance)' % $._config, | ||
146 | refresh='time', | ||
147 | ) | ||
148 | ) | ||
149 | .addRow( | ||
150 | row.new() | ||
151 | .addPanel(idleCPU) | ||
152 | .addPanel(systemLoad) | ||
153 | ) | ||
154 | .addRow( | ||
155 | row.new() | ||
156 | .addPanel(memoryGraph) | ||
157 | .addPanel(memoryGauge) | ||
158 | ) | ||
159 | .addRow( | ||
160 | row.new() | ||
161 | .addPanel(diskIO) | ||
162 | .addPanel(diskSpaceUsage) | ||
163 | ) | ||
164 | .addRow( | ||
165 | row.new() | ||
166 | .addPanel(networkReceived) | ||
167 | .addPanel(networkTransmitted) | ||
168 | ), | ||
169 | }, | ||
170 | } | ||
diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet new file mode 100644 index 0000000..3e368c8 --- /dev/null +++ b/docs/node-mixin/dashboards/use.libsonnet | |||
@@ -0,0 +1,151 @@ | |||
1 | local g = import 'grafana-builder/grafana.libsonnet'; | ||
2 | |||
3 | { | ||
4 | grafanaDashboards+:: { | ||
5 | 'node-cluster-rsrc-use.json': | ||
6 | local legendLink = '%s/dashboard/file/node-rsrc-use.json' % $._config.grafana_prefix;  // Drill-down target: the per-node USE dashboard defined below in this file. | ||
7 | |||
8 | g.dashboard('USE Method / Cluster') | ||
9 | .addRow( | ||
10 | g.row('CPU') | ||
11 | .addPanel( | ||
12 | g.panel('CPU Utilisation') + | ||
13 | g.queryPanel('instance:node_cpu_utilisation:avg1m * instance:node_num_cpu:sum / scalar(sum(instance:node_num_cpu:sum))', '{{instance}}', legendLink) + | ||
14 | g.stack + | ||
15 | { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, | ||
16 | ) | ||
17 | .addPanel( | ||
18 | g.panel('CPU Saturation (Load1)') + | ||
19 | g.queryPanel(||| | ||
20 | instance:node_cpu_saturation_load1: / scalar(sum(up{%(nodeExporterSelector)s})) | ||
21 | ||| % $._config, '{{instance}}', legendLink) + | ||
22 | g.stack + | ||
23 | { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, | ||
24 | ) | ||
25 | ) | ||
26 | .addRow( | ||
27 | g.row('Memory') | ||
28 | .addPanel( | ||
29 | g.panel('Memory Utilisation') + | ||
30 | g.queryPanel('instance:node_memory_utilisation:ratio', '{{instance}}', legendLink) + | ||
31 | g.stack + | ||
32 | { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, | ||
33 | ) | ||
34 | .addPanel( | ||
35 | g.panel('Memory Saturation (Swap I/O)') + | ||
36 | g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate', '{{instance}}', legendLink) + | ||
37 | g.stack + | ||
38 | { yaxes: g.yaxes('Bps') }, | ||
39 | ) | ||
40 | ) | ||
41 | .addRow( | ||
42 | g.row('Disk') | ||
43 | .addPanel( | ||
44 | g.panel('Disk IO Utilisation') + | ||
45 | // Full utilisation would be all disks on each node spending an average of | ||
46 | // 1 sec per second doing I/O, normalize by node count for stacked charts | ||
47 | g.queryPanel(||| | ||
48 | instance:node_disk_utilisation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s})) | ||
49 | ||| % $._config, '{{instance}}', legendLink) + | ||
50 | g.stack + | ||
51 | { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, | ||
52 | ) | ||
53 | .addPanel( | ||
54 | g.panel('Disk IO Saturation') + | ||
55 | g.queryPanel(||| | ||
56 | instance:node_disk_saturation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s})) | ||
57 | ||| % $._config, '{{instance}}', legendLink) + | ||
58 | g.stack + | ||
59 | { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, | ||
60 | ) | ||
61 | ) | ||
62 | .addRow( | ||
63 | g.row('Network') | ||
64 | .addPanel( | ||
65 | g.panel('Net Utilisation (Transmitted)') + | ||
66 | g.queryPanel('instance:node_net_utilisation:sum_irate', '{{instance}}', legendLink) + | ||
67 | g.stack + | ||
68 | { yaxes: g.yaxes('Bps') }, | ||
69 | ) | ||
70 | .addPanel( | ||
71 | g.panel('Net Saturation (Dropped)') + | ||
72 | g.queryPanel('instance:node_net_saturation:sum_irate', '{{instance}}', legendLink) + | ||
73 | g.stack + | ||
74 | { yaxes: g.yaxes('Bps') }, | ||
75 | ) | ||
76 | ) | ||
77 | .addRow( | ||
78 | g.row('Storage') | ||
79 | .addPanel( | ||
80 | g.panel('Disk Capacity') + | ||
81 | g.queryPanel('sum(max(node_filesystem_size_bytes{fstype=~"ext[24]"} - node_filesystem_free_bytes{fstype=~"ext[24]"}) by (device,instance,namespace)) by (instance,namespace) / scalar(sum(max(node_filesystem_size_bytes{fstype=~"ext[24]"}) by (device,instance,namespace)))', '{{instance}}', legendLink) + | ||
82 | g.stack + | ||
83 | { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, | ||
84 | ), | ||
85 | ), | ||
86 | |||
87 | 'node-rsrc-use.json': | ||
88 | g.dashboard('USE Method / Node') | ||
89 | .addTemplate('instance', 'up{%(nodeExporterSelector)s}' % $._config, 'instance') | ||
90 | .addRow( | ||
91 | g.row('CPU') | ||
92 | .addPanel( | ||
93 | g.panel('CPU Utilisation') + | ||
94 | g.queryPanel('instance:node_cpu_utilisation:avg1m{instance="$instance"}', 'Utilisation') + | ||
95 | { yaxes: g.yaxes('percentunit') }, | ||
96 | ) | ||
97 | .addPanel( | ||
98 | g.panel('CPU Saturation (Load1)') + | ||
99 | g.queryPanel('instance:node_cpu_saturation_load1:{instance="$instance"}', 'Saturation') + | ||
100 | { yaxes: g.yaxes('percentunit') }, | ||
101 | ) | ||
102 | ) | ||
103 | .addRow( | ||
104 | g.row('Memory') | ||
105 | .addPanel( | ||
106 | g.panel('Memory Utilisation') + | ||
107 | g.queryPanel('instance:node_memory_utilisation:ratio{instance="$instance"}', 'Memory') + | ||
108 | { yaxes: g.yaxes('percentunit') }, | ||
109 | ) | ||
110 | .addPanel( | ||
111 | g.panel('Memory Saturation (Swap I/O)') + | ||
112 | g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate{instance="$instance"}', 'Swap IO') + | ||
113 | { yaxes: g.yaxes('Bps') }, | ||
114 | ) | ||
115 | ) | ||
116 | .addRow( | ||
117 | g.row('Disk') | ||
118 | .addPanel( | ||
119 | g.panel('Disk IO Utilisation') + | ||
120 | g.queryPanel('instance:node_disk_utilisation:sum_irate{instance="$instance"}', 'Utilisation') + | ||
121 | { yaxes: g.yaxes('percentunit') }, | ||
122 | ) | ||
123 | .addPanel( | ||
124 | g.panel('Disk IO Saturation') + | ||
125 | g.queryPanel('instance:node_disk_saturation:sum_irate{instance="$instance"}', 'Saturation') + | ||
126 | { yaxes: g.yaxes('percentunit') }, | ||
127 | ) | ||
128 | ) | ||
129 | .addRow( | ||
130 | g.row('Net') | ||
131 | .addPanel( | ||
132 | g.panel('Net Utilisation (Transmitted)') + | ||
133 | g.queryPanel('instance:node_net_utilisation:sum_irate{instance="$instance"}', 'Utilisation') + | ||
134 | { yaxes: g.yaxes('Bps') }, | ||
135 | ) | ||
136 | .addPanel( | ||
137 | g.panel('Net Saturation (Dropped)') + | ||
138 | g.queryPanel('instance:node_net_saturation:sum_irate{instance="$instance"}', 'Saturation') + | ||
139 | { yaxes: g.yaxes('Bps') }, | ||
140 | ) | ||
141 | ) | ||
142 | .addRow( | ||
143 | g.row('Disk') | ||
144 | .addPanel( | ||
145 | g.panel('Disk Utilisation') + | ||
146 | g.queryPanel('1 - sum(max by (device, node) (node_filesystem_free_bytes{fstype=~"ext[24]"})) / sum(max by (device, node) (node_filesystem_size_bytes{fstype=~"ext[24]"}))', 'Disk') + | ||
147 | { yaxes: g.yaxes('percentunit') }, | ||
148 | ), | ||
149 | ), | ||
150 | }, | ||
151 | } | ||
diff --git a/docs/node-mixin/jsonnetfile.json b/docs/node-mixin/jsonnetfile.json new file mode 100644 index 0000000..45326aa --- /dev/null +++ b/docs/node-mixin/jsonnetfile.json | |||
@@ -0,0 +1,24 @@ | |||
1 | { | ||
2 | "dependencies": [ | ||
3 | { | ||
4 | "name": "grafonnet", | ||
5 | "source": { | ||
6 | "git": { | ||
7 | "remote": "https://github.com/grafana/grafonnet-lib", | ||
8 | "subdir": "grafonnet" | ||
9 | } | ||
10 | }, | ||
11 | "version": "master" | ||
12 | }, | ||
13 | { | ||
14 | "name": "grafana-builder", | ||
15 | "source": { | ||
16 | "git": { | ||
17 | "remote": "https://github.com/kausalco/public", | ||
18 | "subdir": "grafana-builder" | ||
19 | } | ||
20 | }, | ||
21 | "version": "master" | ||
22 | } | ||
23 | ] | ||
24 | } | ||
diff --git a/docs/node-mixin/lib/promgrafonnet/gauge.libsonnet b/docs/node-mixin/lib/promgrafonnet/gauge.libsonnet new file mode 100644 index 0000000..43640b6 --- /dev/null +++ b/docs/node-mixin/lib/promgrafonnet/gauge.libsonnet | |||
@@ -0,0 +1,60 @@ | |||
1 | local grafana = import 'grafonnet/grafana.libsonnet'; | ||
2 | local singlestat = grafana.singlestat; | ||
3 | local prometheus = grafana.prometheus; | ||
4 | |||
5 | { | ||
6 | new(title, query):: | ||
7 | singlestat.new( | ||
8 | title, | ||
9 | datasource='$datasource', | ||
10 | span=3, | ||
11 | format='percentunit', | ||
12 | valueName='current', | ||
13 | colors=[ | ||
14 | 'rgba(245, 54, 54, 0.9)', | ||
15 | 'rgba(237, 129, 40, 0.89)', | ||
16 | 'rgba(50, 172, 45, 0.97)', | ||
17 | ], | ||
18 | thresholds='50, 80', | ||
19 | valueMaps=[ | ||
20 | { | ||
21 | op: '=', | ||
22 | text: 'N/A', | ||
23 | value: 'null', | ||
24 | }, | ||
25 | ], | ||
26 | ) | ||
27 | .addTarget( | ||
28 | prometheus.target( | ||
29 | query | ||
30 | ) | ||
31 | ) + { | ||
32 | gauge: { | ||
33 | maxValue: 100, | ||
34 | minValue: 0, | ||
35 | show: true, | ||
36 | thresholdLabels: false, | ||
37 | thresholdMarkers: true, | ||
38 | }, | ||
39 | withTextNullValue(text):: self { | ||
40 | valueMaps: [ | ||
41 | { | ||
42 | op: '=', | ||
43 | text: text, | ||
44 | value: 'null', | ||
45 | }, | ||
46 | ], | ||
47 | }, | ||
48 | withSpanSize(size):: self { | ||
49 | span: size, | ||
50 | }, | ||
51 | withLowerBeingBetter():: self { | ||
52 | colors: [ | ||
53 | 'rgba(50, 172, 45, 0.97)', | ||
54 | 'rgba(237, 129, 40, 0.89)', | ||
55 | 'rgba(245, 54, 54, 0.9)', | ||
56 | ], | ||
57 | thresholds: '80, 90', | ||
58 | }, | ||
59 | }, | ||
60 | } | ||
diff --git a/docs/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet b/docs/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet new file mode 100644 index 0000000..bc1d6f6 --- /dev/null +++ b/docs/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet | |||
@@ -0,0 +1,48 @@ | |||
1 | local grafana = import 'grafonnet/grafana.libsonnet'; | ||
2 | local singlestat = grafana.singlestat; | ||
3 | local prometheus = grafana.prometheus; | ||
4 | |||
5 | { | ||
6 | new(title, query):: | ||
7 | singlestat.new( | ||
8 | title, | ||
9 | datasource='prometheus', | ||
10 | span=3, | ||
11 | valueName='current', | ||
12 | valueMaps=[ | ||
13 | { | ||
14 | op: '=', | ||
15 | text: '0', | ||
16 | value: 'null', | ||
17 | }, | ||
18 | ], | ||
19 | ) | ||
20 | .addTarget( | ||
21 | prometheus.target( | ||
22 | query | ||
23 | ) | ||
24 | ) + { | ||
25 | withTextNullValue(text):: self { | ||
26 | valueMaps: [ | ||
27 | { | ||
28 | op: '=', | ||
29 | text: text, | ||
30 | value: 'null', | ||
31 | }, | ||
32 | ], | ||
33 | }, | ||
34 | withSpanSize(size):: self { | ||
35 | span: size, | ||
36 | }, | ||
37 | withPostfix(postfix):: self { | ||
38 | postfix: postfix, | ||
39 | }, | ||
40 | withSparkline():: self { | ||
41 | sparkline: { | ||
42 | show: true, | ||
43 | lineColor: 'rgb(31, 120, 193)', | ||
44 | fillColor: 'rgba(31, 118, 189, 0.18)', | ||
45 | }, | ||
46 | }, | ||
47 | }, | ||
48 | } | ||
diff --git a/docs/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet b/docs/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet new file mode 100644 index 0000000..013ff42 --- /dev/null +++ b/docs/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet | |||
@@ -0,0 +1,5 @@ | |||
1 | { // Index of the local Grafana panel helpers; all fields are hidden (::) so they are not manifested. | ||
2 | numbersinglestat:: import 'numbersinglestat.libsonnet', | ||
3 | gauge:: import 'gauge.libsonnet', | ||
4 | percentlinegraph:: import 'percentlinegraph.libsonnet',  // NOTE(review): this file is not among the files listed in this commit; presumably only fails if the field is evaluated - confirm or drop. | ||
5 | } | ||
diff --git a/docs/node-mixin/mixin.libsonnet b/docs/node-mixin/mixin.libsonnet new file mode 100644 index 0000000..b9831f9 --- /dev/null +++ b/docs/node-mixin/mixin.libsonnet | |||
@@ -0,0 +1,4 @@ | |||
1 | (import 'config.libsonnet') + | ||
2 | (import 'alerts/alerts.libsonnet') + | ||
3 | (import 'dashboards/dashboards.libsonnet') + | ||
4 | (import 'rules/rules.libsonnet') | ||
diff --git a/docs/node-mixin/rules/rules.libsonnet b/docs/node-mixin/rules/rules.libsonnet new file mode 100644 index 0000000..f836d0d --- /dev/null +++ b/docs/node-mixin/rules/rules.libsonnet | |||
@@ -0,0 +1,106 @@ | |||
1 | { | ||
2 | prometheusRules+:: { | ||
3 | groups+: [ | ||
4 | { | ||
5 | name: 'node-exporter.rules', | ||
6 | rules: [ | ||
7 | { | ||
8 | // This rule gives the number of CPUs per node. | ||
9 | record: 'instance:node_num_cpu:sum', | ||
10 | expr: ||| | ||
11 | count by (instance) ( | ||
12 | sum by (instance, cpu) ( | ||
13 | node_cpu_seconds_total{%(nodeExporterSelector)s} | ||
14 | ) | ||
15 | ) | ||
16 | ||| % $._config, | ||
17 | }, | ||
18 | { | ||
19 | // CPU utilisation is % CPU is not idle. | ||
20 | record: 'instance:node_cpu_utilisation:avg1m', | ||
21 | expr: ||| | ||
22 | 1 - avg by (instance) ( | ||
23 | rate(node_cpu_seconds_total{%(nodeExporterSelector)s,mode="idle"}[1m]) | ||
24 | ) | ||
25 | ||| % $._config, | ||
26 | }, | ||
27 | { | ||
28 | // CPU saturation is 1min avg run queue length / number of CPUs. | ||
29 | // Can go over 100%. >100% is bad. | ||
30 | record: 'instance:node_cpu_saturation_load1:', | ||
31 | expr: ||| | ||
32 | sum by (instance) (node_load1{%(nodeExporterSelector)s}) | ||
33 | / | ||
34 | instance:node_num_cpu:sum | ||
35 | ||| % $._config, | ||
36 | }, | ||
37 | { | ||
38 | // Total memory per node | ||
39 | record: 'instance:node_memory_bytes_total:sum', | ||
40 | expr: ||| | ||
41 | sum by (instance) ( | ||
42 | node_memory_MemTotal_bytes{%(nodeExporterSelector)s} | ||
43 | ) | ||
44 | ||| % $._config, | ||
45 | }, | ||
46 | { | ||
47 | // Memory utilisation per node, normalized by per-node memory | ||
48 | record: 'instance:node_memory_utilisation:ratio', | ||
49 | expr: ||| | ||
50 | 1 - ( | ||
51 | node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} | ||
52 | / | ||
53 | node_memory_MemTotal_bytes{%(nodeExporterSelector)s} | ||
54 | ) | ||
55 | ||| % $._config, | ||
56 | }, | ||
57 | { | ||
58 | record: 'instance:node_memory_swap_io_bytes:sum_rate', | ||
59 | expr: ||| | ||
60 | 1e3 * sum by (instance) ( | ||
61 | (rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m]) | ||
62 | + rate(node_vmstat_pgpgout{%(nodeExporterSelector)s}[1m])) | ||
63 | ) | ||
64 | ||| % $._config, | ||
65 | }, | ||
66 | { | ||
67 | // Disk utilisation (seconds spent doing I/O per second, via irate() over sd*/xvd* devices) | ||
68 | record: 'instance:node_disk_utilisation:sum_irate', | ||
69 | expr: ||| | ||
70 | sum by (instance) ( | ||
71 | irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) | ||
72 | ) | ||
73 | ||| % $._config, | ||
74 | }, | ||
75 | { | ||
76 | // Disk saturation (weighted I/O seconds spent per second, via irate() over sd*/xvd* devices) | ||
77 | record: 'instance:node_disk_saturation:sum_irate', | ||
78 | expr: ||| | ||
79 | sum by (instance) ( | ||
80 | irate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) | ||
81 | ) | ||
82 | ||| % $._config, | ||
83 | }, | ||
84 | { | ||
85 | record: 'instance:node_net_utilisation:sum_irate', | ||
86 | expr: ||| | ||
87 | sum by (instance) ( | ||
88 | (irate(node_network_receive_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) + | ||
89 | irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])) | ||
90 | ) | ||
91 | ||| % $._config, | ||
92 | }, | ||
93 | { | ||
94 | record: 'instance:node_net_saturation:sum_irate', | ||
95 | expr: ||| | ||
96 | sum by (instance) ( | ||
97 | (irate(node_network_receive_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) + | ||
98 | irate(node_network_transmit_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])) | ||
99 | ) | ||
100 | ||| % $._config, | ||
101 | }, | ||
102 | ], | ||
103 | }, | ||
104 | ], | ||
105 | }, | ||
106 | } | ||