diff options
Diffstat (limited to 'docs')
-rw-r--r-- | docs/node-mixin/dashboards/node.libsonnet | 50 | ||||
-rw-r--r-- | docs/node-mixin/dashboards/use.libsonnet | 71 | ||||
-rw-r--r-- | docs/node-mixin/rules/rules.libsonnet | 49 |
3 files changed, 105 insertions, 65 deletions
diff --git a/docs/node-mixin/dashboards/node.libsonnet b/docs/node-mixin/dashboards/node.libsonnet index 115d98c..040d60a 100644 --- a/docs/node-mixin/dashboards/node.libsonnet +++ b/docs/node-mixin/dashboards/node.libsonnet | |||
@@ -21,22 +21,23 @@ local gauge = promgrafonnet.gauge; | |||
21 | ) | 21 | ) |
22 | .addTarget(prometheus.target( | 22 | .addTarget(prometheus.target( |
23 | ||| | 23 | ||| |
24 | 1 - avg by (cpu) (irate(node_cpu{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m])) | 24 | 1 - avg by (cpu) (irate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m])) |
25 | ||| % $._config, | 25 | ||| % $._config, |
26 | legendFormat='{{cpu}}', | 26 | legendFormat='{{cpu}}', |
27 | intervalFactor=10, | 27 | intervalFactor=10, |
28 | )); | 28 | )); |
29 | 29 | ||
30 | // TODO: Is this panel useful? | ||
30 | local systemLoad = | 31 | local systemLoad = |
31 | graphPanel.new( | 32 | graphPanel.new( |
32 | 'System load', | 33 | 'Load Average', |
33 | datasource='$datasource', | 34 | datasource='$datasource', |
34 | span=6, | 35 | span=6, |
35 | format='percentunit', | 36 | format='short', |
36 | ) | 37 | ) |
37 | .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 1m')) | 38 | .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='1m load average')) |
38 | .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 5m')) | 39 | .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='5m load average')) |
39 | .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 15m')); | 40 | .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='15m load average')); |
40 | 41 | ||
41 | local memoryGraph = | 42 | local memoryGraph = |
42 | graphPanel.new( | 43 | graphPanel.new( |
@@ -48,27 +49,27 @@ local gauge = promgrafonnet.gauge; | |||
48 | .addTarget(prometheus.target( | 49 | .addTarget(prometheus.target( |
49 | ||| | 50 | ||| |
50 | ( | 51 | ( |
51 | node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} | 52 | node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"} |
52 | - | 53 | - |
53 | node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"} | 54 | node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"} |
54 | - | 55 | - |
55 | node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"} | 56 | node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"} |
56 | - | 57 | - |
57 | node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"} | 58 | node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"} |
58 | ) | 59 | ) |
59 | ||| % $._config, legendFormat='memory used' | 60 | ||| % $._config, legendFormat='memory used' |
60 | )) | 61 | )) |
61 | .addTarget(prometheus.target('node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers')) | 62 | .addTarget(prometheus.target('node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers')) |
62 | .addTarget(prometheus.target('node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached')) | 63 | .addTarget(prometheus.target('node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached')) |
63 | .addTarget(prometheus.target('node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free')); | 64 | .addTarget(prometheus.target('node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free')); |
64 | 65 | ||
65 | local memoryGauge = gauge.new( | 66 | local memoryGauge = gauge.new( |
66 | 'Memory Usage', | 67 | 'Memory Usage', |
67 | ||| | 68 | ||| |
68 | ( | 69 | ( |
69 | node_memory_MemAvailable{%(nodeExporterSelector)s, instance="$instance"} | 70 | node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance"} |
70 | / | 71 | / |
71 | node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} | 72 | node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"} |
72 | ) | 73 | ) |
73 | * 100 | 74 | * 100 |
74 | ||| % $._config, | 75 | ||| % $._config, |
@@ -80,9 +81,9 @@ local gauge = promgrafonnet.gauge; | |||
80 | datasource='$datasource', | 81 | datasource='$datasource', |
81 | span=9, | 82 | span=9, |
82 | ) | 83 | ) |
83 | .addTarget(prometheus.target('sum by (instance) (irate(node_disk_bytes_read_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='read')) | 84 | .addTarget(prometheus.target('sum by (instance) (irate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='read')) |
84 | .addTarget(prometheus.target('sum by (instance) (irate(node_disk_bytes_written_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='written')) | 85 | .addTarget(prometheus.target('sum by (instance) (irate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='written')) |
85 | .addTarget(prometheus.target('sum by (instance) (irate(node_disk_io_time_ms{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='io time')) + | 86 | .addTarget(prometheus.target('sum by (instance) (irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='io time')) + |
86 | { | 87 | { |
87 | seriesOverrides: [ | 88 | seriesOverrides: [ |
88 | { | 89 | { |
@@ -96,18 +97,19 @@ local gauge = promgrafonnet.gauge; | |||
96 | ], | 97 | ], |
97 | yaxes: [ | 98 | yaxes: [ |
98 | self.yaxe(format='bytes'), | 99 | self.yaxe(format='bytes'), |
99 | self.yaxe(format='ms'), | 100 | self.yaxe(format='s'), |
100 | ], | 101 | ], |
101 | }; | 102 | }; |
102 | 103 | ||
104 | // TODO: Should this be partitioned by mountpoint? | ||
103 | local diskSpaceUsage = gauge.new( | 105 | local diskSpaceUsage = gauge.new( |
104 | 'Disk Space Usage', | 106 | 'Disk Space Usage', |
105 | ||| | 107 | ||| |
106 | 100 - | 108 | 100 - |
107 | ( | 109 | ( |
108 | sum(node_filesystem_free{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"} | 110 | sum(node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance"} |
109 | / | 111 | / |
110 | sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"} | 112 | sum(node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance"} |
111 | * 100 | 113 | * 100 |
112 | ) | 114 | ) |
113 | ||| % $._config, | 115 | ||| % $._config, |
@@ -120,7 +122,7 @@ local gauge = promgrafonnet.gauge; | |||
120 | span=6, | 122 | span=6, |
121 | format='bytes', | 123 | format='bytes', |
122 | ) | 124 | ) |
123 | .addTarget(prometheus.target('irate(node_network_receive_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[1m])' % $._config, legendFormat='{{device}}')); | 125 | .addTarget(prometheus.target('irate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}')); |
124 | 126 | ||
125 | local networkTransmitted = | 127 | local networkTransmitted = |
126 | graphPanel.new( | 128 | graphPanel.new( |
@@ -129,7 +131,7 @@ local gauge = promgrafonnet.gauge; | |||
129 | span=6, | 131 | span=6, |
130 | format='bytes', | 132 | format='bytes', |
131 | ) | 133 | ) |
132 | .addTarget(prometheus.target('irate(node_network_transmit_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[1m])' % $._config, legendFormat='{{device}}')); | 134 | .addTarget(prometheus.target('irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}')); |
133 | 135 | ||
134 | dashboard.new('Nodes', time_from='now-1h') | 136 | dashboard.new('Nodes', time_from='now-1h') |
135 | .addTemplate( | 137 | .addTemplate( |
@@ -152,7 +154,7 @@ local gauge = promgrafonnet.gauge; | |||
152 | template.new( | 154 | template.new( |
153 | 'instance', | 155 | 'instance', |
154 | '$datasource', | 156 | '$datasource', |
155 | 'label_values(node_boot_time{%(nodeExporterSelector)s}, instance)' % $._config, | 157 | 'label_values(node_boot_time_seconds{%(nodeExporterSelector)s}, instance)' % $._config, |
156 | refresh='time', | 158 | refresh='time', |
157 | ) | 159 | ) |
158 | ) | 160 | ) |
diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet index 9bba604..96bf0f5 100644 --- a/docs/node-mixin/dashboards/use.libsonnet +++ b/docs/node-mixin/dashboards/use.libsonnet | |||
@@ -10,16 +10,30 @@ local g = import 'grafana-builder/grafana.libsonnet'; | |||
10 | g.row('CPU') | 10 | g.row('CPU') |
11 | .addPanel( | 11 | .addPanel( |
12 | g.panel('CPU Utilisation') + | 12 | g.panel('CPU Utilisation') + |
13 | g.queryPanel('instance:node_cpu_utilisation:avg1m * instance:node_num_cpu:sum / scalar(sum(instance:node_num_cpu:sum))', '{{instance}}', legendLink) + | 13 | g.queryPanel(||| |
14 | ( | ||
15 | instance:node_cpu_utilisation:avg1m | ||
16 | * | ||
17 | instance:node_num_cpu:sum | ||
18 | / ignoring (instance) group_left | ||
19 | sum without (instance) (instance:node_num_cpu:sum) | ||
20 | ) | ||
21 | |||, '{{instance}}', legendLink) + | ||
14 | g.stack + | 22 | g.stack + |
15 | { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, | 23 | { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, |
16 | ) | 24 | ) |
17 | .addPanel( | 25 | .addPanel( |
18 | g.panel('CPU Saturation (Load1)') + | 26 | // TODO: Is this a useful panel? |
27 | g.panel('CPU Saturation (load1 per CPU)') + | ||
19 | g.queryPanel(||| | 28 | g.queryPanel(||| |
20 | instance:node_cpu_saturation_load1: / scalar(sum(up{%(nodeExporterSelector)s})) | 29 | ( |
21 | ||| % $._config, '{{instance}}', legendLink) + | 30 | instance:node_load1_per_cpu:ratio |
31 | / ignoring (instance) group_left | ||
32 | count without (instance) (instance:node_load1_per_cpu:ratio) | ||
33 | ) | ||
34 | |||, '{{instance}}', legendLink) + | ||
22 | g.stack + | 35 | g.stack + |
36 | // TODO: Does `max: 1` make sense? The stack can go over 1 in high-load scenarios. | ||
23 | { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, | 37 | { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, |
24 | ) | 38 | ) |
25 | ) | 39 | ) |
@@ -43,16 +57,26 @@ local g = import 'grafana-builder/grafana.libsonnet'; | |||
43 | .addPanel( | 57 | .addPanel( |
44 | g.panel('Disk IO Utilisation') + | 58 | g.panel('Disk IO Utilisation') + |
45 | // Full utilisation would be all disks on each node spending an average of | 59 | // Full utilisation would be all disks on each node spending an average of |
46 | // 1 sec per second doing I/O, normalize by node count for stacked charts | 60 | // 1 second per second doing I/O, normalize by metric cardinality for stacked charts. |
47 | g.queryPanel('instance:node_disk_utilisation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s}))' % $._config, '{{instance}}', legendLink) + | 61 | g.queryPanel(||| |
62 | ( | ||
63 | instance:node_disk_utilisation:sum_irate | ||
64 | / ignoring (instance) group_left | ||
65 | count without (instance) (instance:node_disk_utilisation:sum_irate) | ||
66 | ) | ||
67 | |||, '{{instance}}', legendLink) + | ||
48 | g.stack + | 68 | g.stack + |
49 | { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, | 69 | { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, |
50 | ) | 70 | ) |
51 | .addPanel( | 71 | .addPanel( |
52 | g.panel('Disk IO Saturation') + | 72 | g.panel('Disk IO Saturation') + |
53 | g.queryPanel(||| | 73 | g.queryPanel(||| |
54 | instance:node_disk_saturation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s})) | 74 | ( |
55 | ||| % $._config, '{{instance}}', legendLink) + | 75 | instance:node_disk_saturation:sum_irate |
76 | / ignoring (instance) group_left | ||
77 | count without (instance) (instance:node_disk_saturation:sum_irate) | ||
78 | ) | ||
79 | |||, '{{instance}}', legendLink) + | ||
56 | g.stack + | 80 | g.stack + |
57 | { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, | 81 | { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, |
58 | ) | 82 | ) |
@@ -76,7 +100,21 @@ local g = import 'grafana-builder/grafana.libsonnet'; | |||
76 | g.row('Storage') | 100 | g.row('Storage') |
77 | .addPanel( | 101 | .addPanel( |
78 | g.panel('Disk Capacity') + | 102 | g.panel('Disk Capacity') + |
79 | g.queryPanel('sum(max(node_filesystem_size_bytes{fstype=~"ext[24]"} - node_filesystem_free_bytes{fstype=~"ext[24]"}) by (device,instance,namespace)) by (instance,namespace) / scalar(sum(max(node_filesystem_size_bytes{fstype=~"ext[24]"}) by (device,instance,namespace)))', '{{instance}}', legendLink) + | 103 | g.queryPanel(||| |
104 | ( | ||
105 | sum without (device) ( | ||
106 | max without (fstype, mountpoint) ( | ||
107 | node_filesystem_size_bytes{fstype=~"ext[24]"} - node_filesystem_avail_bytes{fstype=~"ext[24]"} | ||
108 | ) | ||
109 | ) | ||
110 | / ignoring (instance) group_left | ||
111 | sum without (instance, device) ( | ||
112 | max without (fstype, mountpoint) ( | ||
113 | node_filesystem_size_bytes{fstype=~"ext[24]"} | ||
114 | ) | ||
115 | ) | ||
116 | ) | ||
117 | |||, '{{instance}}', legendLink) + | ||
80 | g.stack + | 118 | g.stack + |
81 | { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, | 119 | { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, |
82 | ), | 120 | ), |
@@ -106,9 +144,9 @@ local g = import 'grafana-builder/grafana.libsonnet'; | |||
106 | { yaxes: g.yaxes('percentunit') }, | 144 | { yaxes: g.yaxes('percentunit') }, |
107 | ) | 145 | ) |
108 | .addPanel( | 146 | .addPanel( |
109 | g.panel('Memory Saturation (Swap I/O)') + | 147 | g.panel('Memory Saturation (pages swapped per second)') + |
110 | g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate{instance="$instance"}', 'Swap IO') + | 148 | g.queryPanel('instance:node_memory_swap_io_pages:sum_rate{instance="$instance"}', 'Swap IO') + |
111 | { yaxes: g.yaxes('Bps') }, | 149 | { yaxes: g.yaxes('short') }, |
112 | ) | 150 | ) |
113 | ) | 151 | ) |
114 | .addRow( | 152 | .addRow( |
@@ -141,7 +179,14 @@ local g = import 'grafana-builder/grafana.libsonnet'; | |||
141 | g.row('Disk') | 179 | g.row('Disk') |
142 | .addPanel( | 180 | .addPanel( |
143 | g.panel('Disk Utilisation') + | 181 | g.panel('Disk Utilisation') + |
144 | g.queryPanel('1 - sum(max by (device, node) (node_filesystem_free_bytes{fstype=~"ext[24]"})) / sum(max by (device, node) (node_filesystem_size_bytes{fstype=~"ext[24]"}))', 'Disk') + | 182 | g.queryPanel(||| |
183 | 1 - | ||
184 | ( | ||
185 | sum(max without (mountpoint, fstype) (node_filesystem_avail_bytes{fstype=~"ext[24]"})) | ||
186 | / | ||
187 | sum(max without (mountpoint, fstype) (node_filesystem_size_bytes{fstype=~"ext[24]"})) | ||
188 | ) | ||
189 | |||, 'Disk') + | ||
145 | { yaxes: g.yaxes('percentunit') }, | 190 | { yaxes: g.yaxes('percentunit') }, |
146 | ), | 191 | ), |
147 | ), | 192 | ), |
diff --git a/docs/node-mixin/rules/rules.libsonnet b/docs/node-mixin/rules/rules.libsonnet index 27636aa..6bd39a5 100644 --- a/docs/node-mixin/rules/rules.libsonnet +++ b/docs/node-mixin/rules/rules.libsonnet | |||
@@ -8,8 +8,8 @@ | |||
8 | // This rule gives the number of CPUs per node. | 8 | // This rule gives the number of CPUs per node. |
9 | record: 'instance:node_num_cpu:sum', | 9 | record: 'instance:node_num_cpu:sum', |
10 | expr: ||| | 10 | expr: ||| |
11 | count by (instance) ( | 11 | count without (cpu) ( |
12 | sum by (instance, cpu) ( | 12 | sum without (mode) ( |
13 | node_cpu_seconds_total{%(nodeExporterSelector)s} | 13 | node_cpu_seconds_total{%(nodeExporterSelector)s} |
14 | ) | 14 | ) |
15 | ) | 15 | ) |
@@ -19,29 +19,20 @@ | |||
19 | // CPU utilisation is the fraction of time the CPU is not idle. | 19 | // CPU utilisation is the fraction of time the CPU is not idle. |
20 | record: 'instance:node_cpu_utilisation:avg1m', | 20 | record: 'instance:node_cpu_utilisation:avg1m', |
21 | expr: ||| | 21 | expr: ||| |
22 | 1 - avg by (instance) ( | 22 | 1 - avg without (cpu, mode) ( |
23 | rate(node_cpu_seconds_total{%(nodeExporterSelector)s,mode="idle"}[1m]) | 23 | rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}[1m]) |
24 | ) | 24 | ) |
25 | ||| % $._config, | 25 | ||| % $._config, |
26 | }, | 26 | }, |
27 | { | 27 | { |
28 | // CPU saturation is 1min avg run queue length / number of CPUs. | 28 | // This is CPU saturation: 1min avg run queue length / number of CPUs. |
29 | // Can go over 100%. >100% is bad. | 29 | // Can go over 1. >1 is bad. |
30 | record: 'instance:node_cpu_saturation_load1:', | 30 | record: 'instance:node_load1_per_cpu:ratio', |
31 | expr: ||| | 31 | expr: ||| |
32 | ( | 32 | ( |
33 | sum by (instance) (node_load1{%(nodeExporterSelector)s}) | 33 | node_load1{%(nodeExporterSelector)s} |
34 | / | 34 | / |
35 | instance:node_num_cpu:sum | 35 | instance:node_num_cpu:sum{%(nodeExporterSelector)s} |
36 | ) | ||
37 | ||| % $._config, | ||
38 | }, | ||
39 | { | ||
40 | // Total memory per node | ||
41 | record: 'instance:node_memory_bytes_total:sum', | ||
42 | expr: ||| | ||
43 | sum by (instance) ( | ||
44 | node_memory_MemTotal_bytes{%(nodeExporterSelector)s} | ||
45 | ) | 36 | ) |
46 | ||| % $._config, | 37 | ||| % $._config, |
47 | }, | 38 | }, |
@@ -57,9 +48,9 @@ | |||
57 | ||| % $._config, | 48 | ||| % $._config, |
58 | }, | 49 | }, |
59 | { | 50 | { |
60 | record: 'instance:node_memory_swap_io_bytes:sum_rate', | 51 | record: 'instance:node_memory_swap_io_pages:sum_rate', |
61 | expr: ||| | 52 | expr: ||| |
62 | 1e3 * sum by (instance) ( | 53 | ( |
63 | rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m]) | 54 | rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m]) |
64 | + | 55 | + |
65 | rate(node_vmstat_pgpgout{%(nodeExporterSelector)s}[1m]) | 56 | rate(node_vmstat_pgpgout{%(nodeExporterSelector)s}[1m]) |
@@ -70,7 +61,7 @@ | |||
70 | // Disk utilisation (seconds spent, 1 second irate()) | 61 | // Disk utilisation (seconds spent, 1 second irate()) |
71 | record: 'instance:node_disk_utilisation:sum_irate', | 62 | record: 'instance:node_disk_utilisation:sum_irate', |
72 | expr: ||| | 63 | expr: ||| |
73 | sum by (instance) ( | 64 | sum without (device) ( |
74 | irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) | 65 | irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) |
75 | ) | 66 | ) |
76 | ||| % $._config, | 67 | ||| % $._config, |
@@ -79,28 +70,30 @@ | |||
79 | // Disk saturation (weighted seconds spent, by irate() it's bound by 1 second) | 70 | // Disk saturation (weighted seconds spent, by irate() it's bound by 1 second) |
80 | record: 'instance:node_disk_saturation:sum_irate', | 71 | record: 'instance:node_disk_saturation:sum_irate', |
81 | expr: ||| | 72 | expr: ||| |
82 | sum by (instance) ( | 73 | sum without (device) ( |
83 | irate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) | 74 | irate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) |
84 | ) | 75 | ) |
85 | ||| % $._config, | 76 | ||| % $._config, |
86 | }, | 77 | }, |
78 | // TODO: For the following two rules, consider configurable filtering to exclude more network | ||
79 | // device names than just "lo". | ||
87 | { | 80 | { |
88 | record: 'instance:node_net_utilisation:sum_irate', | 81 | record: 'instance:node_net_utilisation:sum_irate', |
89 | expr: ||| | 82 | expr: ||| |
90 | sum by (instance) ( | 83 | sum without (device) ( |
91 | irate(node_network_receive_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) | 84 | irate(node_network_receive_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m]) |
92 | + | 85 | + |
93 | irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) | 86 | irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m]) |
94 | ) | 87 | ) |
95 | ||| % $._config, | 88 | ||| % $._config, |
96 | }, | 89 | }, |
97 | { | 90 | { |
98 | record: 'instance:node_net_saturation:sum_irate', | 91 | record: 'instance:node_net_saturation:sum_irate', |
99 | expr: ||| | 92 | expr: ||| |
100 | sum by (instance) ( | 93 | sum without (device) ( |
101 | irate(node_network_receive_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) | 94 | irate(node_network_receive_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m]) |
102 | + | 95 | + |
103 | irate(node_network_transmit_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) | 96 | irate(node_network_transmit_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m]) |
104 | ) | 97 | ) |
105 | ||| % $._config, | 98 | ||| % $._config, |
106 | }, | 99 | }, |