diff options
author | beorn7 <beorn@grafana.com> | 2019-07-17 23:54:31 +0200 |
---|---|---|
committer | beorn7 <beorn@grafana.com> | 2019-07-17 23:54:31 +0200 |
commit | 706511a49598db1c256a85b2b7dec4e6d754cabd (patch) | |
tree | 0b02af19a790389665117fa825006c6dcc6b2ce4 /docs | |
parent | 3a770a0b1d988cc81fc1d1c25f994d3de4cb0af7 (diff) | |
download | prometheus_node_collector-706511a49598db1c256a85b2b7dec4e6d754cabd.tar.bz2 prometheus_node_collector-706511a49598db1c256a85b2b7dec4e6d754cabd.tar.xz prometheus_node_collector-706511a49598db1c256a85b2b7dec4e6d754cabd.zip |
Responses to review comments, round 3
Signed-off-by: beorn7 <beorn@grafana.com>
Diffstat (limited to 'docs')
-rw-r--r-- | docs/node-mixin/config.libsonnet | 15 | ||||
-rw-r--r-- | docs/node-mixin/dashboards/node.libsonnet | 18 | ||||
-rw-r--r-- | docs/node-mixin/dashboards/use.libsonnet | 76 | ||||
-rw-r--r-- | docs/node-mixin/rules/rules.libsonnet | 15 |
4 files changed, 81 insertions, 43 deletions
diff --git a/docs/node-mixin/config.libsonnet b/docs/node-mixin/config.libsonnet index 701d9be..95070ca 100644 --- a/docs/node-mixin/config.libsonnet +++ b/docs/node-mixin/config.libsonnet | |||
@@ -5,12 +5,17 @@ | |||
5 | // Select the metrics coming from the node exporter. | 5 | // Select the metrics coming from the node exporter. |
6 | nodeExporterSelector: 'job="node"', | 6 | nodeExporterSelector: 'job="node"', |
7 | 7 | ||
8 | // Select the fstype for filesystem-related queries. | 8 | // Select the fstype for filesystem-related queries. If left |
9 | // TODO: What is a good default selector here? | 9 | // empty, all filesystems are selected. If you have unusual |
10 | fsSelector: 'fstype=~"ext.|xfs|jfs|btrfs|vfat|ntfs"', | 10 | // filesystems you don't want to include in dashboards and |
11 | // alerting, you can exclude them here, e.g. 'fstype!="tmpfs"'. | ||
12 | fsSelector: '', | ||
11 | 13 | ||
12 | // Select the device for disk-related queries. | 14 | // Select the device for disk-related queries. If left empty, all |
13 | diskDeviceSelector: 'device=~"(sd|xvd).+"', | 15 | // devices are selected. If you have unusual devices you don't |
16 | // want to include in dashboards and alerting, you can exclude | ||
17 | // them here, e.g. 'device!="tmpfs"'. | ||
18 | diskDeviceSelector: '', | ||
14 | 19 | ||
15 | grafana_prefix: '', | 20 | grafana_prefix: '', |
16 | }, | 21 | }, |
diff --git a/docs/node-mixin/dashboards/node.libsonnet b/docs/node-mixin/dashboards/node.libsonnet index 915cbe4..c3c97f3 100644 --- a/docs/node-mixin/dashboards/node.libsonnet +++ b/docs/node-mixin/dashboards/node.libsonnet | |||
@@ -22,7 +22,7 @@ local gauge = promgrafonnet.gauge; | |||
22 | .addTarget(prometheus.target( | 22 | .addTarget(prometheus.target( |
23 | // TODO: Consider using `${__interval}` as range and a 1m min step. | 23 | // TODO: Consider using `${__interval}` as range and a 1m min step. |
24 | ||| | 24 | ||| |
25 | 1 - avg by (cpu) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m])) | 25 | 1 - rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m]) |
26 | ||| % $._config, | 26 | ||| % $._config, |
27 | legendFormat='{{cpu}}', | 27 | legendFormat='{{cpu}}', |
28 | intervalFactor=10, | 28 | intervalFactor=10, |
@@ -64,15 +64,18 @@ local gauge = promgrafonnet.gauge; | |||
64 | .addTarget(prometheus.target('node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached')) | 64 | .addTarget(prometheus.target('node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached')) |
65 | .addTarget(prometheus.target('node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free')); | 65 | .addTarget(prometheus.target('node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free')); |
66 | 66 | ||
67 | // TODO: It would be nicer to have a gauge that gets a 0-1 range and displays it as a percentage 0%-100%. | ||
68 | // This needs to be added upstream in the promgrafonnet library and then changed here. | ||
67 | local memoryGauge = gauge.new( | 69 | local memoryGauge = gauge.new( |
68 | 'Memory Usage', | 70 | 'Memory Usage', |
69 | ||| | 71 | ||| |
72 | 100 - | ||
70 | ( | 73 | ( |
71 | node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance"} | 74 | node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance"} |
72 | / | 75 | / |
73 | node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"} | 76 | node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"} |
74 | ) | ||
75 | * 100 | 77 | * 100 |
78 | ) | ||
76 | ||| % $._config, | 79 | ||| % $._config, |
77 | ).withLowerBeingBetter(); | 80 | ).withLowerBeingBetter(); |
78 | 81 | ||
@@ -82,10 +85,11 @@ local gauge = promgrafonnet.gauge; | |||
82 | datasource='$datasource', | 85 | datasource='$datasource', |
83 | span=9, | 86 | span=9, |
84 | ) | 87 | ) |
88 | // TODO: Does it make sense to have those three in the same panel? | ||
85 | // TODO: Consider using `${__interval}` as range and a 1m min step. | 89 | // TODO: Consider using `${__interval}` as range and a 1m min step. |
86 | .addTarget(prometheus.target('sum by (instance, device) (rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='{{device}} read')) | 90 | .addTarget(prometheus.target('rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m])' % $._config, legendFormat='{{device}} read')) |
87 | .addTarget(prometheus.target('sum by (instance, device) (rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='{{device}} written')) | 91 | .addTarget(prometheus.target('rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m])' % $._config, legendFormat='{{device}} written')) |
88 | .addTarget(prometheus.target('sum by (instance, device) (rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='{{device}} io time')) + | 92 | .addTarget(prometheus.target('rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m])' % $._config, legendFormat='{{device}} io time')) + |
89 | { | 93 | { |
90 | seriesOverrides: [ | 94 | seriesOverrides: [ |
91 | { | 95 | { |
@@ -103,6 +107,8 @@ local gauge = promgrafonnet.gauge; | |||
103 | ], | 107 | ], |
104 | }; | 108 | }; |
105 | 109 | ||
110 | // TODO: It would be nicer to have a gauge that gets a 0-1 range and displays it as a percentage 0%-100%. | ||
111 | // This needs to be added upstream in the promgrafonnet library and then changed here. | ||
106 | // TODO: Should this be partitioned by mountpoint? | 112 | // TODO: Should this be partitioned by mountpoint? |
107 | local diskSpaceUsage = gauge.new( | 113 | local diskSpaceUsage = gauge.new( |
108 | 'Disk Space Usage', | 114 | 'Disk Space Usage', |
@@ -158,7 +164,7 @@ local gauge = promgrafonnet.gauge; | |||
158 | template.new( | 164 | template.new( |
159 | 'instance', | 165 | 'instance', |
160 | '$datasource', | 166 | '$datasource', |
161 | 'label_values(node_boot_time_seconds{%(nodeExporterSelector)s}, instance)' % $._config, | 167 | 'label_values(node_exporter_build_info{%(nodeExporterSelector)s}, instance)' % $._config, |
162 | refresh='time', | 168 | refresh='time', |
163 | ) | 169 | ) |
164 | ) | 170 | ) |
diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet index 533f392..e3739ac 100644 --- a/docs/node-mixin/dashboards/use.libsonnet +++ b/docs/node-mixin/dashboards/use.libsonnet | |||
@@ -12,13 +12,13 @@ local g = import 'grafana-builder/grafana.libsonnet'; | |||
12 | g.panel('CPU Utilisation') + | 12 | g.panel('CPU Utilisation') + |
13 | g.queryPanel(||| | 13 | g.queryPanel(||| |
14 | ( | 14 | ( |
15 | instance:node_cpu_utilisation:avg_rate1m | 15 | instance:node_cpu_utilisation:avg_rate1m{%(nodeExporterSelector)s} |
16 | * | 16 | * |
17 | instance:node_num_cpu:sum | 17 | instance:node_num_cpu:sum{%(nodeExporterSelector)s} |
18 | / ignoring (instance) group_left | 18 | / ignoring (instance) group_left |
19 | sum without (instance) (instance:node_num_cpu:sum) | 19 | sum without (instance) (instance:node_num_cpu:sum{%(nodeExporterSelector)s}) |
20 | ) | 20 | ) |
21 | |||, '{{instance}}', legendLink) + | 21 | ||| % $._config, '{{instance}}', legendLink) + |
22 | g.stack + | 22 | g.stack + |
23 | { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, | 23 | { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, |
24 | ) | 24 | ) |
@@ -27,11 +27,11 @@ local g = import 'grafana-builder/grafana.libsonnet'; | |||
27 | g.panel('CPU Saturation (load1 per CPU)') + | 27 | g.panel('CPU Saturation (load1 per CPU)') + |
28 | g.queryPanel(||| | 28 | g.queryPanel(||| |
29 | ( | 29 | ( |
30 | instance:node_load1_per_cpu:ratio | 30 | instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s} |
31 | / ignoring (instance) group_left | 31 | / ignoring (instance) group_left |
32 | count without (instance) (instance:node_load1_per_cpu:ratio) | 32 | count without (instance) (instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s}) |
33 | ) | 33 | ) |
34 | |||, '{{instance}}', legendLink) + | 34 | ||| % $._config, '{{instance}}', legendLink) + |
35 | g.stack + | 35 | g.stack + |
36 | // TODO: Does `max: 1` make sense? The stack can go over 1 in high-load scenarios. | 36 | // TODO: Does `max: 1` make sense? The stack can go over 1 in high-load scenarios. |
37 | { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, | 37 | { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, |
@@ -41,13 +41,13 @@ local g = import 'grafana-builder/grafana.libsonnet'; | |||
41 | g.row('Memory') | 41 | g.row('Memory') |
42 | .addPanel( | 42 | .addPanel( |
43 | g.panel('Memory Utilisation') + | 43 | g.panel('Memory Utilisation') + |
44 | g.queryPanel('instance:node_memory_utilisation:ratio', '{{instance}}', legendLink) + | 44 | g.queryPanel('instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}' % $._config, '{{instance}}', legendLink) + |
45 | g.stack + | 45 | g.stack + |
46 | { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, | 46 | { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, |
47 | ) | 47 | ) |
48 | .addPanel( | 48 | .addPanel( |
49 | g.panel('Memory Saturation (Swap I/O)') + | 49 | g.panel('Memory Saturation (Swap I/O)') + |
50 | g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate', '{{instance}}', legendLink) + | 50 | g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate{%(nodeExporterSelector)s}' % $._config, '{{instance}}', legendLink) + |
51 | g.stack + | 51 | g.stack + |
52 | { yaxes: g.yaxes('Bps') }, | 52 | { yaxes: g.yaxes('Bps') }, |
53 | ) | 53 | ) |
@@ -60,11 +60,11 @@ local g = import 'grafana-builder/grafana.libsonnet'; | |||
60 | // 1 second per second doing I/O, normalize by metric cardinality for stacked charts. | 60 | // 1 second per second doing I/O, normalize by metric cardinality for stacked charts. |
61 | g.queryPanel(||| | 61 | g.queryPanel(||| |
62 | ( | 62 | ( |
63 | instance:node_disk_io_time:sum_rate1m | 63 | instance:node_disk_io_time_seconds:sum_rate1m{%(nodeExporterSelector)s} |
64 | / ignoring (instance) group_left | 64 | / ignoring (instance) group_left |
65 | count without (instance) (instance:node_disk_io_time:sum_rate1m) | 65 | count without (instance) (instance:node_disk_io_time_seconds:sum_rate1m{%(nodeExporterSelector)s}) |
66 | ) | 66 | ) |
67 | |||, '{{instance}}', legendLink) + | 67 | ||| % $._config, '{{instance}}', legendLink) + |
68 | g.stack + | 68 | g.stack + |
69 | { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, | 69 | { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, |
70 | ) | 70 | ) |
@@ -72,11 +72,11 @@ local g = import 'grafana-builder/grafana.libsonnet'; | |||
72 | g.panel('Disk IO Saturation') + | 72 | g.panel('Disk IO Saturation') + |
73 | g.queryPanel(||| | 73 | g.queryPanel(||| |
74 | ( | 74 | ( |
75 | instance:node_disk_io_time_weighted:sum_rate1m | 75 | instance:node_disk_io_time_weighted_seconds:sum_rate1m{%(nodeExporterSelector)s} |
76 | / ignoring (instance) group_left | 76 | / ignoring (instance) group_left |
77 | count without (instance) (instance:node_disk_io_time_weighted:sum_rate1m) | 77 | count without (instance) (instance:node_disk_io_time_weighted_seconds:sum_rate1m{%(nodeExporterSelector)s}) |
78 | ) | 78 | ) |
79 | |||, '{{instance}}', legendLink) + | 79 | ||| % $._config, '{{instance}}', legendLink) + |
80 | g.stack + | 80 | g.stack + |
81 | { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, | 81 | { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, |
82 | ) | 82 | ) |
@@ -84,16 +84,30 @@ local g = import 'grafana-builder/grafana.libsonnet'; | |||
84 | .addRow( | 84 | .addRow( |
85 | g.row('Network') | 85 | g.row('Network') |
86 | .addPanel( | 86 | .addPanel( |
87 | g.panel('Net Utilisation (Transmitted)') + | 87 | g.panel('Net Utilisation (Bytes Receive/Transmit)') + |
88 | g.queryPanel('instance:node_net_utilisation:sum_irate', '{{instance}}', legendLink) + | 88 | g.queryPanel( |
89 | [ | ||
90 | 'instance:node_network_receive_bytes:sum_rate1m{%(nodeExporterSelector)s}' % $._config, | ||
91 | '-instance:node_network_transmit_bytes:sum_rate1m{%(nodeExporterSelector)s}' % $._config, | ||
92 | ], | ||
93 | ['{{instance}} Receive', '{{instance}} Transmit'], | ||
94 | legendLink, | ||
95 | ) + | ||
89 | g.stack + | 96 | g.stack + |
90 | { yaxes: g.yaxes('Bps') }, | 97 | { yaxes: g.yaxes('Bps') }, |
91 | ) | 98 | ) |
92 | .addPanel( | 99 | .addPanel( |
93 | g.panel('Net Saturation (Dropped)') + | 100 | g.panel('Net Saturation (Drops Receive/Transmit)') + |
94 | g.queryPanel('instance:node_net_saturation:sum_irate', '{{instance}}', legendLink) + | 101 | g.queryPanel( |
102 | [ | ||
103 | 'instance:node_network_receive_drop:sum_rate1m{%(nodeExporterSelector)s}' % $._config, | ||
104 | '-instance:node_network_transmit_drop:sum_rate1m{%(nodeExporterSelector)s}' % $._config, | ||
105 | ], | ||
106 | ['{{instance}} Receive', '{{instance}} Transmit'], | ||
107 | legendLink, | ||
108 | ) + | ||
95 | g.stack + | 109 | g.stack + |
96 | { yaxes: g.yaxes('Bps') }, | 110 | { yaxes: g.yaxes('rps') }, |
97 | ) | 111 | ) |
98 | ) | 112 | ) |
99 | .addRow( | 113 | .addRow( |
@@ -127,12 +141,12 @@ local g = import 'grafana-builder/grafana.libsonnet'; | |||
127 | g.row('CPU') | 141 | g.row('CPU') |
128 | .addPanel( | 142 | .addPanel( |
129 | g.panel('CPU Utilisation') + | 143 | g.panel('CPU Utilisation') + |
130 | g.queryPanel('instance:node_cpu_utilisation:avg_rate1m{instance="$instance"}', 'Utilisation') + | 144 | g.queryPanel('instance:node_cpu_utilisation:avg_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Utilisation') + |
131 | { yaxes: g.yaxes('percentunit') }, | 145 | { yaxes: g.yaxes('percentunit') }, |
132 | ) | 146 | ) |
133 | .addPanel( | 147 | .addPanel( |
134 | g.panel('CPU Saturation (Load1)') + | 148 | g.panel('CPU Saturation (Load1)') + |
135 | g.queryPanel('instance:node_cpu_saturation_load1:{instance="$instance"}', 'Saturation') + | 149 | g.queryPanel('instance:node_cpu_saturation_load1:{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Saturation') + |
136 | { yaxes: g.yaxes('percentunit') }, | 150 | { yaxes: g.yaxes('percentunit') }, |
137 | ) | 151 | ) |
138 | ) | 152 | ) |
@@ -140,12 +154,12 @@ local g = import 'grafana-builder/grafana.libsonnet'; | |||
140 | g.row('Memory') | 154 | g.row('Memory') |
141 | .addPanel( | 155 | .addPanel( |
142 | g.panel('Memory Utilisation') + | 156 | g.panel('Memory Utilisation') + |
143 | g.queryPanel('instance:node_memory_utilisation:ratio{instance="$instance"}', 'Memory') + | 157 | g.queryPanel('instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Memory') + |
144 | { yaxes: g.yaxes('percentunit') }, | 158 | { yaxes: g.yaxes('percentunit') }, |
145 | ) | 159 | ) |
146 | .addPanel( | 160 | .addPanel( |
147 | g.panel('Memory Saturation (pages swapped per second)') + | 161 | g.panel('Memory Saturation (pages swapped per second)') + |
148 | g.queryPanel('instance:node_memory_swap_io_pages:rate1m{instance="$instance"}', 'Swap IO') + | 162 | g.queryPanel('instance:node_memory_swap_io_pages:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Swap IO') + |
149 | { yaxes: g.yaxes('short') }, | 163 | { yaxes: g.yaxes('short') }, |
150 | ) | 164 | ) |
151 | ) | 165 | ) |
@@ -153,12 +167,12 @@ local g = import 'grafana-builder/grafana.libsonnet'; | |||
153 | g.row('Disk') | 167 | g.row('Disk') |
154 | .addPanel( | 168 | .addPanel( |
155 | g.panel('Disk IO Utilisation') + | 169 | g.panel('Disk IO Utilisation') + |
156 | g.queryPanel('instance:node_disk_io_time:sum_rate1m{instance="$instance"}', 'Utilisation') + | 170 | g.queryPanel('instance:node_disk_io_time_seconds:sum_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Utilisation') + |
157 | { yaxes: g.yaxes('percentunit') }, | 171 | { yaxes: g.yaxes('percentunit') }, |
158 | ) | 172 | ) |
159 | .addPanel( | 173 | .addPanel( |
160 | g.panel('Disk IO Saturation') + | 174 | g.panel('Disk IO Saturation') + |
161 | g.queryPanel('instance:node_disk_io_time_weighted:sum_rate1m{instance="$instance"}', 'Saturation') + | 175 | g.queryPanel('instance:node_disk_io_time_weighted_seconds:sum_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Saturation') + |
162 | { yaxes: g.yaxes('percentunit') }, | 176 | { yaxes: g.yaxes('percentunit') }, |
163 | ) | 177 | ) |
164 | ) | 178 | ) |
@@ -167,7 +181,10 @@ local g = import 'grafana-builder/grafana.libsonnet'; | |||
167 | .addPanel( | 181 | .addPanel( |
168 | g.panel('Net Utilisation (Bytes Receive/Transmit)') + | 182 | g.panel('Net Utilisation (Bytes Receive/Transmit)') + |
169 | g.queryPanel( | 183 | g.queryPanel( |
170 | ['node_network_receive_bytes_total{instance="$instance"}', '-node_network_transmit_bytes_total{instance="$instance"}'], | 184 | [ |
185 | 'instance:node_network_receive_bytes:sum_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, | ||
186 | '-instance:node_network_transmit_bytes:sum_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, | ||
187 | ], | ||
171 | ['Receive', 'Transmit'], | 188 | ['Receive', 'Transmit'], |
172 | ) + | 189 | ) + |
173 | { yaxes: g.yaxes('Bps') }, | 190 | { yaxes: g.yaxes('Bps') }, |
@@ -175,7 +192,10 @@ local g = import 'grafana-builder/grafana.libsonnet'; | |||
175 | .addPanel( | 192 | .addPanel( |
176 | g.panel('Net Saturation (Drops Receive/Transmit)') + | 193 | g.panel('Net Saturation (Drops Receive/Transmit)') + |
177 | g.queryPanel( | 194 | g.queryPanel( |
178 | ['node_network_receive_drop_total{instance="$instance"}', '-node_network_transmit_drop_total{instance="$instance"}'], | 195 | [ |
196 | 'instance:node_network_receive_drop:sum_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, | ||
197 | '-instance:node_network_transmit_drop:sum_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, | ||
198 | ], | ||
179 | ['Receive drops', 'Transmit drops'], | 199 | ['Receive drops', 'Transmit drops'], |
180 | ) + | 200 | ) + |
181 | { yaxes: g.yaxes('rps') }, | 201 | { yaxes: g.yaxes('rps') }, |
diff --git a/docs/node-mixin/rules/rules.libsonnet b/docs/node-mixin/rules/rules.libsonnet index 5422f44..d8c0fae 100644 --- a/docs/node-mixin/rules/rules.libsonnet +++ b/docs/node-mixin/rules/rules.libsonnet | |||
@@ -9,7 +9,7 @@ | |||
9 | record: 'instance:node_num_cpu:sum', | 9 | record: 'instance:node_num_cpu:sum', |
10 | expr: ||| | 10 | expr: ||| |
11 | count without (cpu) ( | 11 | count without (cpu) ( |
12 | sum without (mode) ( | 12 | count without (mode) ( |
13 | node_cpu_seconds_total{%(nodeExporterSelector)s} | 13 | node_cpu_seconds_total{%(nodeExporterSelector)s} |
14 | ) | 14 | ) |
15 | ) | 15 | ) |
@@ -26,7 +26,9 @@ | |||
26 | }, | 26 | }, |
27 | { | 27 | { |
28 | // This is CPU saturation: 1min avg run queue length / number of CPUs. | 28 | // This is CPU saturation: 1min avg run queue length / number of CPUs. |
29 | // Can go over 1. >1 is bad. | 29 | // Can go over 1. |
30 | // TODO: There are situations where a run queue >1/core is just normal and fine. | ||
31 | // We need to clarify how to read this metric and if its usage is helpful at all. | ||
30 | record: 'instance:node_load1_per_cpu:ratio', | 32 | record: 'instance:node_load1_per_cpu:ratio', |
31 | expr: ||| | 33 | expr: ||| |
32 | ( | 34 | ( |
@@ -59,7 +61,9 @@ | |||
59 | }, | 61 | }, |
60 | { | 62 | { |
61 | // Disk utilisation (seconds spent, 1 second rate) | 63 | // Disk utilisation (seconds spent, 1 second rate) |
62 | record: 'instance:node_disk_io_time:sum_rate1m', | 64 | // TODO: This should probably not aggregate over all devices but |
65 | // keep them separate. | ||
66 | record: 'instance:node_disk_io_time_seconds:sum_rate1m', | ||
63 | expr: ||| | 67 | expr: ||| |
64 | sum without (device) ( | 68 | sum without (device) ( |
65 | rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) | 69 | rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) |
@@ -68,7 +72,9 @@ | |||
68 | }, | 72 | }, |
69 | { | 73 | { |
70 | // Disk saturation (weighted seconds spent, 1 second rate) | 74 | // Disk saturation (weighted seconds spent, 1 second rate) |
71 | record: 'instance:node_disk_io_time_weighted:sum_rate1m', | 75 | // TODO: This should probably not aggregate over all devices but |
76 | // keep them separate. | ||
77 | record: 'instance:node_disk_io_time_weighted_seconds:sum_rate1m', | ||
72 | expr: ||| | 78 | expr: ||| |
73 | sum without (device) ( | 79 | sum without (device) ( |
74 | rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) | 80 | rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) |
@@ -93,6 +99,7 @@ | |||
93 | ) | 99 | ) |
94 | ||| % $._config, | 100 | ||| % $._config, |
95 | }, | 101 | }, |
102 | // TODO: Find out if those drops ever happen on modern switched networks. | ||
96 | { | 103 | { |
97 | record: 'instance:node_network_receive_drop:sum_rate1m', | 104 | record: 'instance:node_network_receive_drop:sum_rate1m', |
98 | expr: ||| | 105 | expr: ||| |