aboutsummaryrefslogtreecommitdiff
path: root/docs
diff options
context:
space:
mode:
authorbeorn7 <beorn@grafana.com>2019-07-05 19:38:03 +0200
committerbeorn7 <beorn@grafana.com>2019-07-05 19:38:03 +0200
commit2df034c05512628fc1946f5031773790b644abfc (patch)
tree89aba5d892678984d434cdb6366c27df9935c1a8 /docs
parent61bcc5b4681230e07b96dbb49a9a7f5e301062bf (diff)
downloadprometheus_node_collector-2df034c05512628fc1946f5031773790b644abfc.tar.bz2
prometheus_node_collector-2df034c05512628fc1946f5031773790b644abfc.tar.xz
prometheus_node_collector-2df034c05512628fc1946f5031773790b644abfc.zip
Move node-mixin into docs directory
Signed-off-by: beorn7 <beorn@grafana.com>
Diffstat (limited to 'docs')
-rw-r--r--docs/node-mixin/.gitignore3
-rw-r--r--docs/node-mixin/alerts/alerts.libsonnet165
-rw-r--r--docs/node-mixin/config.libsonnet11
-rw-r--r--docs/node-mixin/dashboards/dashboards.libsonnet2
-rw-r--r--docs/node-mixin/dashboards/node.libsonnet170
-rw-r--r--docs/node-mixin/dashboards/use.libsonnet151
-rw-r--r--docs/node-mixin/jsonnetfile.json24
-rw-r--r--docs/node-mixin/lib/promgrafonnet/gauge.libsonnet60
-rw-r--r--docs/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet48
-rw-r--r--docs/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet5
-rw-r--r--docs/node-mixin/mixin.libsonnet4
-rw-r--r--docs/node-mixin/rules/rules.libsonnet106
12 files changed, 749 insertions, 0 deletions
diff --git a/docs/node-mixin/.gitignore b/docs/node-mixin/.gitignore
new file mode 100644
index 0000000..65d141b
--- /dev/null
+++ b/docs/node-mixin/.gitignore
@@ -0,0 +1,3 @@
1/jsonnetfile.lock.json
2/vendor/
3
diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet
new file mode 100644
index 0000000..8ea70cc
--- /dev/null
+++ b/docs/node-mixin/alerts/alerts.libsonnet
@@ -0,0 +1,165 @@
1{
2 prometheusAlerts+:: {
3 groups+: [
4 {
5 name: 'node-exporter',
6 rules: [
7 {
8 alert: 'NodeFilesystemSpaceFillingUp',
9 expr: |||
10 predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0
11 and
12 node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4
13 and
14 node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
15 ||| % $._config,
16 'for': '1h',
17 labels: {
18 severity: 'warning',
19 },
20 annotations: {
21 message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of space within the next 24 hours.',
22 },
23 },
24 {
25 alert: 'NodeFilesystemSpaceFillingUp',
26 expr: |||
27 predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0
28 and
29 node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2
30 and
31 node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
32 ||| % $._config,
33 'for': '1h',
34 labels: {
35 severity: 'critical',
36 },
37 annotations: {
38 message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of space within the next 4 hours.',
39 },
40 },
41 {
42 alert: 'NodeFilesystemOutOfSpace',
43 expr: |||
44 node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5
45 and
46 node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
47 ||| % $._config,
48 'for': '1h',
49 labels: {
50 severity: 'warning',
51 },
52 annotations: {
53 message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available space left.',
54 },
55 },
56 {
57 alert: 'NodeFilesystemOutOfSpace',
58 expr: |||
59 node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3
60 and
61 node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
62 ||| % $._config,
63 'for': '1h',
64 labels: {
65 severity: 'critical',
66 },
67 annotations: {
68 message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available space left.',
69 },
70 },
71 {
72 alert: 'NodeFilesystemFilesFillingUp',
73 expr: |||
74 predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0
75 and
76 node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4
77 and
78 node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
79 ||| % $._config,
80 'for': '1h',
81 labels: {
82 severity: 'warning',
83 },
84 annotations: {
85 message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of files within the next 24 hours.',
86 },
87 },
88 {
89 alert: 'NodeFilesystemFilesFillingUp',
90 expr: |||
91 predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0
92 and
93 node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2
94 and
95 node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
96 ||| % $._config,
97 'for': '1h',
98 labels: {
99 severity: 'critical',
100 },
101 annotations: {
102 message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of files within the next 4 hours.',
103 },
104 },
105 {
106 alert: 'NodeFilesystemOutOfFiles',
107 expr: |||
108 node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5
109 and
110 node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
111 ||| % $._config,
112 'for': '1h',
113 labels: {
114 severity: 'warning',
115 },
116 annotations: {
117 message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available inodes left.',
118 },
119 },
120 {  // Critical tier of the inode-exhaustion alert: fires when under 3% of inodes are free on a writable filesystem.
121 alert: 'NodeFilesystemOutOfFiles',
122 expr: |||
123 node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3
124 and
125 node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
126 ||| % $._config,
127 'for': '1h',
128 labels: {
129 severity: 'critical',
130 },
131 annotations: {
132 message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available inodes left.',
133 },
134 },
135 {
136 alert: 'NodeNetworkReceiveErrs',
137 expr: |||
138 increase(node_network_receive_errs_total[2m]) > 10
139 ||| % $._config,
140 'for': '1h',
141 labels: {
142 severity: 'critical',
143 },
144 annotations: {
145 message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while receiving packets ({{ $value }} errors in two minutes).',
146 },
147 },
148 {
149 alert: 'NodeNetworkTransmitErrs',
150 expr: |||
151 increase(node_network_transmit_errs_total[2m]) > 10
152 ||| % $._config,
153 'for': '1h',
154 labels: {
155 severity: 'critical',
156 },
157 annotations: {
158 message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while transmitting packets ({{ $value }} errors in two minutes).',
159 },
160 },
161 ],
162 },
163 ],
164 },
165}
diff --git a/docs/node-mixin/config.libsonnet b/docs/node-mixin/config.libsonnet
new file mode 100644
index 0000000..6c5d6f7
--- /dev/null
+++ b/docs/node-mixin/config.libsonnet
@@ -0,0 +1,11 @@
1{
2 _config+:: {
3 // Selectors are inserted between {} in Prometheus queries.
4 nodeExporterSelector: 'job="node-exporter"',
5
6 // Mainly extracted because they are repetitive, but also useful to customize.
7 fsSelectors: 'fstype=~"ext.|xfs",mountpoint!="/var/lib/docker/aufs"',
8
9 grafana_prefix: '',
10 },
11}
diff --git a/docs/node-mixin/dashboards/dashboards.libsonnet b/docs/node-mixin/dashboards/dashboards.libsonnet
new file mode 100644
index 0000000..e6adbd4
--- /dev/null
+++ b/docs/node-mixin/dashboards/dashboards.libsonnet
@@ -0,0 +1,2 @@
1(import 'node.libsonnet') +
2(import 'use.libsonnet')
diff --git a/docs/node-mixin/dashboards/node.libsonnet b/docs/node-mixin/dashboards/node.libsonnet
new file mode 100644
index 0000000..4594e3e
--- /dev/null
+++ b/docs/node-mixin/dashboards/node.libsonnet
@@ -0,0 +1,170 @@
1local grafana = import 'grafonnet/grafana.libsonnet';
2local dashboard = grafana.dashboard;
3local row = grafana.row;
4local prometheus = grafana.prometheus;
5local template = grafana.template;
6local graphPanel = grafana.graphPanel;
7local promgrafonnet = import '../lib/promgrafonnet/promgrafonnet.libsonnet';
8local gauge = promgrafonnet.gauge;
9
10{
11 grafanaDashboards+:: {
12 'nodes.json':
13 local idleCPU =  // Per-CPU graph of (1 - idle rate). NOTE(review): the title says "Idle CPU" but the query plots the non-idle fraction - confirm which is intended.
14 graphPanel.new(
15 'Idle CPU',
16 datasource='$datasource',
17 span=6,
18 format='percentunit',
19 max=1,  // 'percentunit' renders 1.0 as 100%, so the axis ceiling is 1, not 100.
20 min=0,
21 )
22 .addTarget(prometheus.target(
23 |||
24 1 - avg by (cpu) (irate(node_cpu{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m]))
25 ||| % $._config,
26 legendFormat='{{cpu}}',
27 intervalFactor=10,
28 ));
29
30 local systemLoad =
31 graphPanel.new(
32 'System load',
33 datasource='$datasource',
34 span=6,
35 format='percentunit',
36 )
37 .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 1m'))
38 .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 5m'))
39 .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 15m'));
40
41 local memoryGraph =
42 graphPanel.new(
43 'Memory Usage',
44 datasource='$datasource',
45 span=9,
46 format='bytes',
47 )
48 .addTarget(prometheus.target(
49 |||
50 node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"}
51 - node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"}
52 - node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}
53 - node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"}
54 ||| % $._config, legendFormat='memory used'
55 ))
56 .addTarget(prometheus.target('node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers'))
57 .addTarget(prometheus.target('node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached'))
58 .addTarget(prometheus.target('node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free'));
59
60 local memoryGauge = gauge.new(
61 'Memory Usage',
62 |||
63 node_memory_MemAvailable{%(nodeExporterSelector)s, instance="$instance"}
64 /
65 node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"}
66 ||| % $._config,
67 ).withLowerBeingBetter();
68
69 local diskIO =
70 graphPanel.new(
71 'Disk I/O',
72 datasource='$datasource',
73 span=9,
74 )
75 .addTarget(prometheus.target('sum by (instance) (irate(node_disk_bytes_read_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='read'))
76 .addTarget(prometheus.target('sum by (instance) (irate(node_disk_bytes_written_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='written'))
77 .addTarget(prometheus.target('sum by (instance) (irate(node_disk_io_time_ms{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='io time')) +
78 {
79 seriesOverrides: [
80 {
81 alias: 'read',
82 yaxis: 1,
83 },
84 {
85 alias: 'io time',
86 yaxis: 2,
87 },
88 ],
89 yaxes: [
90 self.yaxe(format='bytes'),
91 self.yaxe(format='ms'),
92 ],
93 };
94
95 local diskSpaceUsage = gauge.new(  // Gauge of the used filesystem fraction (1 - free/size) summed over the selected instance, excluding rootfs.
96 'Disk Space Usage',
97 |||
98 1 - (
99 sum(node_filesystem_free{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"})
100 /
101 sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"})
102 )
103 ||| % $._config,
104 ).withLowerBeingBetter();
105
106 local networkReceived =
107 graphPanel.new(
108 'Network Received',
109 datasource='$datasource',
110 span=6,
111 format='bytes',
112 )
113 .addTarget(prometheus.target('irate(node_network_receive_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[1m])' % $._config, legendFormat='{{device}}'));
114
115 local networkTransmitted =
116 graphPanel.new(
117 'Network Transmitted',
118 datasource='$datasource',
119 span=6,
120 format='bytes',
121 )
122 .addTarget(prometheus.target('irate(node_network_transmit_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[1m])' % $._config, legendFormat='{{device}}'));
123
124 dashboard.new('Nodes', time_from='now-1h')
125 .addTemplate(
126 {
127 current: {
128 text: 'Prometheus',
129 value: 'Prometheus',
130 },
131 hide: 0,
132 label: null,
133 name: 'datasource',
134 options: [],
135 query: 'prometheus',
136 refresh: 1,
137 regex: '',
138 type: 'datasource',
139 },
140 )
141 .addTemplate(
142 template.new(
143 'instance',
144 '$datasource',
145 'label_values(node_boot_time{%(nodeExporterSelector)s}, instance)' % $._config,
146 refresh='time',
147 )
148 )
149 .addRow(
150 row.new()
151 .addPanel(idleCPU)
152 .addPanel(systemLoad)
153 )
154 .addRow(
155 row.new()
156 .addPanel(memoryGraph)
157 .addPanel(memoryGauge)
158 )
159 .addRow(
160 row.new()
161 .addPanel(diskIO)
162 .addPanel(diskSpaceUsage)
163 )
164 .addRow(
165 row.new()
166 .addPanel(networkReceived)
167 .addPanel(networkTransmitted)
168 ),
169 },
170}
diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet
new file mode 100644
index 0000000..3e368c8
--- /dev/null
+++ b/docs/node-mixin/dashboards/use.libsonnet
@@ -0,0 +1,151 @@
1local g = import 'grafana-builder/grafana.libsonnet';
2
3{
4 grafanaDashboards+:: {
5 'node-cluster-rsrc-use.json':
6 local legendLink = '%s/dashboard/file/node-rsrc-use.json' % $._config.grafana_prefix;  // Passed to every queryPanel below; points at the per-node USE dashboard defined in this file.
7
8 g.dashboard('USE Method / Cluster')
9 .addRow(
10 g.row('CPU')
11 .addPanel(
12 g.panel('CPU Utilisation') +
13 g.queryPanel('instance:node_cpu_utilisation:avg1m * instance:node_num_cpu:sum / scalar(sum(instance:node_num_cpu:sum))', '{{instance}}', legendLink) +
14 g.stack +
15 { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
16 )
17 .addPanel(
18 g.panel('CPU Saturation (Load1)') +
19 g.queryPanel(|||
20 instance:node_cpu_saturation_load1: / scalar(sum(up{%(nodeExporterSelector)s}))
21 ||| % $._config, '{{instance}}', legendLink) +
22 g.stack +
23 { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
24 )
25 )
26 .addRow(
27 g.row('Memory')
28 .addPanel(
29 g.panel('Memory Utilisation') +
30 g.queryPanel('instance:node_memory_utilisation:ratio', '{{instance}}', legendLink) +
31 g.stack +
32 { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
33 )
34 .addPanel(
35 g.panel('Memory Saturation (Swap I/O)') +
36 g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate', '{{instance}}', legendLink) +
37 g.stack +
38 { yaxes: g.yaxes('Bps') },
39 )
40 )
41 .addRow(
42 g.row('Disk')
43 .addPanel(
44 g.panel('Disk IO Utilisation') +
45 // Full utilisation would be all disks on each node spending an average of
46 // 1 sec per second doing I/O, normalize by node count for stacked charts
47 g.queryPanel(|||
48 instance:node_disk_utilisation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s}))
49 ||| % $._config, '{{instance}}', legendLink) +
50 g.stack +
51 { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
52 )
53 .addPanel(
54 g.panel('Disk IO Saturation') +
55 g.queryPanel(|||
56 instance:node_disk_saturation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s}))
57 ||| % $._config, '{{instance}}', legendLink) +
58 g.stack +
59 { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
60 )
61 )
62 .addRow(
63 g.row('Network')
64 .addPanel(
65 g.panel('Net Utilisation (Transmitted)') +
66 g.queryPanel('instance:node_net_utilisation:sum_irate', '{{instance}}', legendLink) +
67 g.stack +
68 { yaxes: g.yaxes('Bps') },
69 )
70 .addPanel(
71 g.panel('Net Saturation (Dropped)') +
72 g.queryPanel('instance:node_net_saturation:sum_irate', '{{instance}}', legendLink) +
73 g.stack +
74 { yaxes: g.yaxes('Bps') },
75 )
76 )
77 .addRow(
78 g.row('Storage')
79 .addPanel(
80 g.panel('Disk Capacity') +
81 g.queryPanel('sum(max(node_filesystem_size_bytes{fstype=~"ext[24]"} - node_filesystem_free_bytes{fstype=~"ext[24]"}) by (device,instance,namespace)) by (instance,namespace) / scalar(sum(max(node_filesystem_size_bytes{fstype=~"ext[24]"}) by (device,instance,namespace)))', '{{instance}}', legendLink) +
82 g.stack +
83 { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
84 ),
85 ),
86
87 'node-rsrc-use.json':
88 g.dashboard('USE Method / Node')
89 .addTemplate('instance', 'up{%(nodeExporterSelector)s}' % $._config, 'instance')
90 .addRow(
91 g.row('CPU')
92 .addPanel(
93 g.panel('CPU Utilisation') +
94 g.queryPanel('instance:node_cpu_utilisation:avg1m{instance="$instance"}', 'Utilisation') +
95 { yaxes: g.yaxes('percentunit') },
96 )
97 .addPanel(
98 g.panel('CPU Saturation (Load1)') +
99 g.queryPanel('instance:node_cpu_saturation_load1:{instance="$instance"}', 'Saturation') +
100 { yaxes: g.yaxes('percentunit') },
101 )
102 )
103 .addRow(
104 g.row('Memory')
105 .addPanel(
106 g.panel('Memory Utilisation') +
107 g.queryPanel('instance:node_memory_utilisation:ratio{instance="$instance"}', 'Memory') +
108 { yaxes: g.yaxes('percentunit') },
109 )
110 .addPanel(
111 g.panel('Memory Saturation (Swap I/O)') +
112 g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate{instance="$instance"}', 'Swap IO') +
113 { yaxes: g.yaxes('Bps') },
114 )
115 )
116 .addRow(
117 g.row('Disk')
118 .addPanel(
119 g.panel('Disk IO Utilisation') +
120 g.queryPanel('instance:node_disk_utilisation:sum_irate{instance="$instance"}', 'Utilisation') +
121 { yaxes: g.yaxes('percentunit') },
122 )
123 .addPanel(
124 g.panel('Disk IO Saturation') +
125 g.queryPanel('instance:node_disk_saturation:sum_irate{instance="$instance"}', 'Saturation') +
126 { yaxes: g.yaxes('percentunit') },
127 )
128 )
129 .addRow(
130 g.row('Net')
131 .addPanel(
132 g.panel('Net Utilisation (Transmitted)') +
133 g.queryPanel('instance:node_net_utilisation:sum_irate{instance="$instance"}', 'Utilisation') +
134 { yaxes: g.yaxes('Bps') },
135 )
136 .addPanel(
137 g.panel('Net Saturation (Dropped)') +
138 g.queryPanel('instance:node_net_saturation:sum_irate{instance="$instance"}', 'Saturation') +
139 { yaxes: g.yaxes('Bps') },
140 )
141 )
142 .addRow(
143 g.row('Disk')
144 .addPanel(
145 g.panel('Disk Utilisation') +
146 g.queryPanel('1 - sum(max by (device, node) (node_filesystem_free_bytes{fstype=~"ext[24]"})) / sum(max by (device, node) (node_filesystem_size_bytes{fstype=~"ext[24]"}))', 'Disk') +
147 { yaxes: g.yaxes('percentunit') },
148 ),
149 ),
150 },
151}
diff --git a/docs/node-mixin/jsonnetfile.json b/docs/node-mixin/jsonnetfile.json
new file mode 100644
index 0000000..45326aa
--- /dev/null
+++ b/docs/node-mixin/jsonnetfile.json
@@ -0,0 +1,24 @@
1{
2 "dependencies": [
3 {
4 "name": "grafonnet",
5 "source": {
6 "git": {
7 "remote": "https://github.com/grafana/grafonnet-lib",
8 "subdir": "grafonnet"
9 }
10 },
11 "version": "master"
12 },
13 {
14 "name": "grafana-builder",
15 "source": {
16 "git": {
17 "remote": "https://github.com/kausalco/public",
18 "subdir": "grafana-builder"
19 }
20 },
21 "version": "master"
22 }
23 ]
24}
diff --git a/docs/node-mixin/lib/promgrafonnet/gauge.libsonnet b/docs/node-mixin/lib/promgrafonnet/gauge.libsonnet
new file mode 100644
index 0000000..43640b6
--- /dev/null
+++ b/docs/node-mixin/lib/promgrafonnet/gauge.libsonnet
@@ -0,0 +1,60 @@
1local grafana = import 'grafonnet/grafana.libsonnet';
2local singlestat = grafana.singlestat;
3local prometheus = grafana.prometheus;
4
5{
6 new(title, query)::
7 singlestat.new(
8 title,
9 datasource='$datasource',
10 span=3,
11 format='percentunit',
12 valueName='current',
13 colors=[
14 'rgba(245, 54, 54, 0.9)',
15 'rgba(237, 129, 40, 0.89)',
16 'rgba(50, 172, 45, 0.97)',
17 ],
18 thresholds='50, 80',
19 valueMaps=[
20 {
21 op: '=',
22 text: 'N/A',
23 value: 'null',
24 },
25 ],
26 )
27 .addTarget(
28 prometheus.target(
29 query
30 )
31 ) + {
32 gauge: {
33 maxValue: 100,
34 minValue: 0,
35 show: true,
36 thresholdLabels: false,
37 thresholdMarkers: true,
38 },
39 withTextNullValue(text):: self {
40 valueMaps: [
41 {
42 op: '=',
43 text: text,
44 value: 'null',
45 },
46 ],
47 },
48 withSpanSize(size):: self {
49 span: size,
50 },
51 withLowerBeingBetter():: self {
52 colors: [
53 'rgba(50, 172, 45, 0.97)',
54 'rgba(237, 129, 40, 0.89)',
55 'rgba(245, 54, 54, 0.9)',
56 ],
57 thresholds: '80, 90',
58 },
59 },
60}
diff --git a/docs/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet b/docs/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet
new file mode 100644
index 0000000..bc1d6f6
--- /dev/null
+++ b/docs/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet
@@ -0,0 +1,48 @@
1local grafana = import 'grafonnet/grafana.libsonnet';
2local singlestat = grafana.singlestat;
3local prometheus = grafana.prometheus;
4
5{
6 new(title, query)::
7 singlestat.new(
8 title,
9 datasource='prometheus',
10 span=3,
11 valueName='current',
12 valueMaps=[
13 {
14 op: '=',
15 text: '0',
16 value: 'null',
17 },
18 ],
19 )
20 .addTarget(
21 prometheus.target(
22 query
23 )
24 ) + {
25 withTextNullValue(text):: self {
26 valueMaps: [
27 {
28 op: '=',
29 text: text,
30 value: 'null',
31 },
32 ],
33 },
34 withSpanSize(size):: self {
35 span: size,
36 },
37 withPostfix(postfix):: self {
38 postfix: postfix,
39 },
40 withSparkline():: self {
41 sparkline: {
42 show: true,
43 lineColor: 'rgb(31, 120, 193)',
44 fillColor: 'rgba(31, 118, 189, 0.18)',
45 },
46 },
47 },
48}
diff --git a/docs/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet b/docs/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet
new file mode 100644
index 0000000..013ff42
--- /dev/null
+++ b/docs/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet
@@ -0,0 +1,5 @@
1{
2 numbersinglestat:: import 'numbersinglestat.libsonnet',
3 gauge:: import 'gauge.libsonnet',
4 percentlinegraph:: import 'percentlinegraph.libsonnet',
5}
diff --git a/docs/node-mixin/mixin.libsonnet b/docs/node-mixin/mixin.libsonnet
new file mode 100644
index 0000000..b9831f9
--- /dev/null
+++ b/docs/node-mixin/mixin.libsonnet
@@ -0,0 +1,4 @@
1(import 'config.libsonnet') +
2(import 'alerts/alerts.libsonnet') +
3(import 'dashboards/dashboards.libsonnet') +
4(import 'rules/rules.libsonnet')
diff --git a/docs/node-mixin/rules/rules.libsonnet b/docs/node-mixin/rules/rules.libsonnet
new file mode 100644
index 0000000..f836d0d
--- /dev/null
+++ b/docs/node-mixin/rules/rules.libsonnet
@@ -0,0 +1,106 @@
1{
2 prometheusRules+:: {
3 groups+: [
4 {
5 name: 'node-exporter.rules',
6 rules: [
7 {
8 // This rule gives the number of CPUs per node.
9 record: 'instance:node_num_cpu:sum',
10 expr: |||
11 count by (instance) (
12 sum by (instance, cpu) (
13 node_cpu_seconds_total{%(nodeExporterSelector)s}
14 )
15 )
16 ||| % $._config,
17 },
18 {
19 // CPU utilisation is % CPU is not idle.
20 record: 'instance:node_cpu_utilisation:avg1m',
21 expr: |||
22 1 - avg by (instance) (
23 rate(node_cpu_seconds_total{%(nodeExporterSelector)s,mode="idle"}[1m])
24 )
25 ||| % $._config,
26 },
27 {
28 // CPU saturation is 1min avg run queue length / number of CPUs.
29 // Can go over 100%. >100% is bad.
30 record: 'instance:node_cpu_saturation_load1:',
31 expr: |||
32 sum by (instance) (node_load1{%(nodeExporterSelector)s})
33 /
34 instance:node_num_cpu:sum
35 ||| % $._config,
36 },
37 {
38 // Total memory per node
39 record: 'instance:node_memory_bytes_total:sum',
40 expr: |||
41 sum by (instance) (
42 node_memory_MemTotal_bytes{%(nodeExporterSelector)s}
43 )
44 ||| % $._config,
45 },
46 {
47 // Memory utilisation per node, normalized by per-node memory
48 record: 'instance:node_memory_utilisation:ratio',
49 expr: |||
50 1 - (
51 node_memory_MemAvailable_bytes{%(nodeExporterSelector)s}
52 /
53 node_memory_MemTotal_bytes{%(nodeExporterSelector)s}
54 )
55 ||| % $._config,
56 },
57 {
58 record: 'instance:node_memory_swap_io_bytes:sum_rate',
59 expr: |||
60 1e3 * sum by (instance) (
61 (rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m])
62 + rate(node_vmstat_pgpgout{%(nodeExporterSelector)s}[1m]))
63 )
64 ||| % $._config,
65 },
66 {
67 // Disk utilisation (ms spent, 1 second irate())
68 record: 'instance:node_disk_utilisation:sum_irate',
69 expr: |||
70 sum by (instance) (
71 irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m])
72 )
73 ||| % $._config,
74 },
75 {
76 // Disk saturation (ms spent, by rate() it's bound by 1 second)
77 record: 'instance:node_disk_saturation:sum_irate',
78 expr: |||
79 sum by (instance) (
80 irate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m])
81 )
82 ||| % $._config,
83 },
84 {
85 record: 'instance:node_net_utilisation:sum_irate',
86 expr: |||
87 sum by (instance) (
88 (irate(node_network_receive_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) +
89 irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]))
90 )
91 ||| % $._config,
92 },
93 {
94 record: 'instance:node_net_saturation:sum_irate',
95 expr: |||
96 sum by (instance) (
97 (irate(node_network_receive_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) +
98 irate(node_network_transmit_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]))
99 )
100 ||| % $._config,
101 },
102 ],
103 },
104 ],
105 },
106}