aboutsummaryrefslogtreecommitdiff
path: root/docs
diff options
context:
space:
mode:
author: beorn7 <beorn@grafana.com> 2019-07-16 21:18:17 +0200
committer: beorn7 <beorn@grafana.com> 2019-07-16 21:18:17 +0200
commit: a92d1d7889ddcbaad50e821cb155795bf3e9758a (patch)
tree: c503d6de51e92659ccb9d956dd5131a21f15cb39 /docs
parent: 3ab1f41d12d55e1561bab58bc6f0ef1604c5dd65 (diff)
download: prometheus_node_collector-a92d1d7889ddcbaad50e821cb155795bf3e9758a.tar.bz2
prometheus_node_collector-a92d1d7889ddcbaad50e821cb155795bf3e9758a.tar.xz
prometheus_node_collector-a92d1d7889ddcbaad50e821cb155795bf3e9758a.zip
Address review comments, batch 2
Signed-off-by: beorn7 <beorn@grafana.com>
Diffstat (limited to 'docs')
-rw-r--r-- docs/node-mixin/alerts/alerts.libsonnet 12
-rw-r--r-- docs/node-mixin/config.libsonnet 5
-rw-r--r-- docs/node-mixin/dashboards/node.libsonnet 16
-rw-r--r-- docs/node-mixin/dashboards/use.libsonnet 34
-rw-r--r-- docs/node-mixin/rules/rules.libsonnet 46
5 files changed, 68 insertions, 45 deletions
diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet
index 013a9ee..76bbb03 100644
--- a/docs/node-mixin/alerts/alerts.libsonnet
+++ b/docs/node-mixin/alerts/alerts.libsonnet
@@ -43,7 +43,7 @@
43 }, 43 },
44 }, 44 },
45 { 45 {
46 alert: 'NodeFilesystemOutOfSpace', 46 alert: 'NodeFilesystemAlmostOutOfSpace',
47 expr: ||| 47 expr: |||
48 ( 48 (
49 node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 5 49 node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 5
@@ -60,7 +60,7 @@
60 }, 60 },
61 }, 61 },
62 { 62 {
63 alert: 'NodeFilesystemOutOfSpace', 63 alert: 'NodeFilesystemAlmostOutOfSpace',
64 expr: ||| 64 expr: |||
65 ( 65 (
66 node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 3 66 node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 3
@@ -115,7 +115,7 @@
115 }, 115 },
116 }, 116 },
117 { 117 {
118 alert: 'NodeFilesystemOutOfFiles', 118 alert: 'NodeFilesystemAlmostOutOfFiles',
119 expr: ||| 119 expr: |||
120 ( 120 (
121 node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 5 121 node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 5
@@ -132,7 +132,7 @@
132 }, 132 },
133 }, 133 },
134 { 134 {
135 alert: 'NodeFilesystemOutOfSpace', 135 alert: 'NodeFilesystemAlmostOutOfFiles',
136 expr: ||| 136 expr: |||
137 ( 137 (
138 node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 3 138 node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 3
@@ -155,7 +155,7 @@
155 ||| % $._config, 155 ||| % $._config,
156 'for': '1h', 156 'for': '1h',
157 labels: { 157 labels: {
158 severity: 'critical', 158 severity: 'warning',
159 }, 159 },
160 annotations: { 160 annotations: {
161 message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while receiving packets ({{ $value }} errors in two minutes).', 161 message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while receiving packets ({{ $value }} errors in two minutes).',
@@ -168,7 +168,7 @@
168 ||| % $._config, 168 ||| % $._config,
169 'for': '1h', 169 'for': '1h',
170 labels: { 170 labels: {
171 severity: 'critical', 171 severity: 'warning',
172 }, 172 },
173 annotations: { 173 annotations: {
174 message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while transmitting packets ({{ $value }} errors in two minutes).', 174 message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while transmitting packets ({{ $value }} errors in two minutes).',
diff --git a/docs/node-mixin/config.libsonnet b/docs/node-mixin/config.libsonnet
index de84b9e..701d9be 100644
--- a/docs/node-mixin/config.libsonnet
+++ b/docs/node-mixin/config.libsonnet
@@ -3,10 +3,11 @@
3 // Selectors are inserted between {} in Prometheus queries. 3 // Selectors are inserted between {} in Prometheus queries.
4 4
5 // Select the metrics coming from the node exporter. 5 // Select the metrics coming from the node exporter.
6 nodeExporterSelector: 'job="node-exporter"', 6 nodeExporterSelector: 'job="node"',
7 7
8 // Select the fstype for filesystem-related queries. 8 // Select the fstype for filesystem-related queries.
9 fsSelector: 'fstype=~"ext.|xfs",mountpoint!="/var/lib/docker/aufs"', 9 // TODO: What is a good default selector here?
10 fsSelector: 'fstype=~"ext.|xfs|jfs|btrfs|vfat|ntfs"',
10 11
11 // Select the device for disk-related queries. 12 // Select the device for disk-related queries.
12 diskDeviceSelector: 'device=~"(sd|xvd).+"', 13 diskDeviceSelector: 'device=~"(sd|xvd).+"',
diff --git a/docs/node-mixin/dashboards/node.libsonnet b/docs/node-mixin/dashboards/node.libsonnet
index 040d60a..915cbe4 100644
--- a/docs/node-mixin/dashboards/node.libsonnet
+++ b/docs/node-mixin/dashboards/node.libsonnet
@@ -20,8 +20,9 @@ local gauge = promgrafonnet.gauge;
20 min=0, 20 min=0,
21 ) 21 )
22 .addTarget(prometheus.target( 22 .addTarget(prometheus.target(
23 // TODO: Consider using `${__interval}` as range and a 1m min step.
23 ||| 24 |||
24 1 - avg by (cpu) (irate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m])) 25 1 - avg by (cpu) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m]))
25 ||| % $._config, 26 ||| % $._config,
26 legendFormat='{{cpu}}', 27 legendFormat='{{cpu}}',
27 intervalFactor=10, 28 intervalFactor=10,
@@ -81,9 +82,10 @@ local gauge = promgrafonnet.gauge;
81 datasource='$datasource', 82 datasource='$datasource',
82 span=9, 83 span=9,
83 ) 84 )
84 .addTarget(prometheus.target('sum by (instance) (irate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='read')) 85 // TODO: Consider using `${__interval}` as range and a 1m min step.
85 .addTarget(prometheus.target('sum by (instance) (irate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='written')) 86 .addTarget(prometheus.target('sum by (instance, device) (rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='{{device}} read'))
86 .addTarget(prometheus.target('sum by (instance) (irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='io time')) + 87 .addTarget(prometheus.target('sum by (instance, device) (rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='{{device}} written'))
88 .addTarget(prometheus.target('sum by (instance, device) (rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='{{device}} io time')) +
87 { 89 {
88 seriesOverrides: [ 90 seriesOverrides: [
89 { 91 {
@@ -122,7 +124,8 @@ local gauge = promgrafonnet.gauge;
122 span=6, 124 span=6,
123 format='bytes', 125 format='bytes',
124 ) 126 )
125 .addTarget(prometheus.target('irate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}')); 127 // TODO: Consider using `${__interval}` as range and a 1m min step.
128 .addTarget(prometheus.target('rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}'));
126 129
127 local networkTransmitted = 130 local networkTransmitted =
128 graphPanel.new( 131 graphPanel.new(
@@ -131,7 +134,8 @@ local gauge = promgrafonnet.gauge;
131 span=6, 134 span=6,
132 format='bytes', 135 format='bytes',
133 ) 136 )
134 .addTarget(prometheus.target('irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}')); 137 // TODO: Consider using `${__interval}` as range and a 1m min step.
138 .addTarget(prometheus.target('rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}'));
135 139
136 dashboard.new('Nodes', time_from='now-1h') 140 dashboard.new('Nodes', time_from='now-1h')
137 .addTemplate( 141 .addTemplate(
diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet
index 115e893..533f392 100644
--- a/docs/node-mixin/dashboards/use.libsonnet
+++ b/docs/node-mixin/dashboards/use.libsonnet
@@ -12,7 +12,7 @@ local g = import 'grafana-builder/grafana.libsonnet';
12 g.panel('CPU Utilisation') + 12 g.panel('CPU Utilisation') +
13 g.queryPanel(||| 13 g.queryPanel(|||
14 ( 14 (
15 instance:node_cpu_utilisation:avg1m 15 instance:node_cpu_utilisation:avg_rate1m
16 * 16 *
17 instance:node_num_cpu:sum 17 instance:node_num_cpu:sum
18 / ignoring (instance) group_left 18 / ignoring (instance) group_left
@@ -60,9 +60,9 @@ local g = import 'grafana-builder/grafana.libsonnet';
60 // 1 second per second doing I/O, normalize by metric cardinality for stacked charts. 60 // 1 second per second doing I/O, normalize by metric cardinality for stacked charts.
61 g.queryPanel(||| 61 g.queryPanel(|||
62 ( 62 (
63 instance:node_disk_utilisation:sum_irate 63 instance:node_disk_io_time:sum_rate1m
64 / ignoring (instance) group_left 64 / ignoring (instance) group_left
65 count without (instance) (instance:node_disk_utilisation:sum_irate) 65 count without (instance) (instance:node_disk_io_time:sum_rate1m)
66 ) 66 )
67 |||, '{{instance}}', legendLink) + 67 |||, '{{instance}}', legendLink) +
68 g.stack + 68 g.stack +
@@ -72,9 +72,9 @@ local g = import 'grafana-builder/grafana.libsonnet';
72 g.panel('Disk IO Saturation') + 72 g.panel('Disk IO Saturation') +
73 g.queryPanel(||| 73 g.queryPanel(|||
74 ( 74 (
75 instance:node_disk_saturation:sum_irate 75 instance:node_disk_io_time_weighted:sum_rate1m
76 / ignoring (instance) group_left 76 / ignoring (instance) group_left
77 count without (instance) (instance:node_disk_saturation:sum_irate) 77 count without (instance) (instance:node_disk_io_time_weighted:sum_rate1m)
78 ) 78 )
79 |||, '{{instance}}', legendLink) + 79 |||, '{{instance}}', legendLink) +
80 g.stack + 80 g.stack +
@@ -127,7 +127,7 @@ local g = import 'grafana-builder/grafana.libsonnet';
127 g.row('CPU') 127 g.row('CPU')
128 .addPanel( 128 .addPanel(
129 g.panel('CPU Utilisation') + 129 g.panel('CPU Utilisation') +
130 g.queryPanel('instance:node_cpu_utilisation:avg1m{instance="$instance"}', 'Utilisation') + 130 g.queryPanel('instance:node_cpu_utilisation:avg_rate1m{instance="$instance"}', 'Utilisation') +
131 { yaxes: g.yaxes('percentunit') }, 131 { yaxes: g.yaxes('percentunit') },
132 ) 132 )
133 .addPanel( 133 .addPanel(
@@ -145,7 +145,7 @@ local g = import 'grafana-builder/grafana.libsonnet';
145 ) 145 )
146 .addPanel( 146 .addPanel(
147 g.panel('Memory Saturation (pages swapped per second)') + 147 g.panel('Memory Saturation (pages swapped per second)') +
148 g.queryPanel('instance:node_memory_swap_io_pages:sum_rate{instance="$instance"}', 'Swap IO') + 148 g.queryPanel('instance:node_memory_swap_io_pages:rate1m{instance="$instance"}', 'Swap IO') +
149 { yaxes: g.yaxes('short') }, 149 { yaxes: g.yaxes('short') },
150 ) 150 )
151 ) 151 )
@@ -153,26 +153,32 @@ local g = import 'grafana-builder/grafana.libsonnet';
153 g.row('Disk') 153 g.row('Disk')
154 .addPanel( 154 .addPanel(
155 g.panel('Disk IO Utilisation') + 155 g.panel('Disk IO Utilisation') +
156 g.queryPanel('instance:node_disk_utilisation:sum_irate{instance="$instance"}', 'Utilisation') + 156 g.queryPanel('instance:node_disk_io_time:sum_rate1m{instance="$instance"}', 'Utilisation') +
157 { yaxes: g.yaxes('percentunit') }, 157 { yaxes: g.yaxes('percentunit') },
158 ) 158 )
159 .addPanel( 159 .addPanel(
160 g.panel('Disk IO Saturation') + 160 g.panel('Disk IO Saturation') +
161 g.queryPanel('instance:node_disk_saturation:sum_irate{instance="$instance"}', 'Saturation') + 161 g.queryPanel('instance:node_disk_io_time_weighted:sum_rate1m{instance="$instance"}', 'Saturation') +
162 { yaxes: g.yaxes('percentunit') }, 162 { yaxes: g.yaxes('percentunit') },
163 ) 163 )
164 ) 164 )
165 .addRow( 165 .addRow(
166 g.row('Net') 166 g.row('Net')
167 .addPanel( 167 .addPanel(
168 g.panel('Net Utilisation (Transmitted)') + 168 g.panel('Net Utilisation (Bytes Receive/Transmit)') +
169 g.queryPanel('instance:node_net_utilisation:sum_irate{instance="$instance"}', 'Utilisation') + 169 g.queryPanel(
170 ['node_network_receive_bytes_total{instance="$instance"}', '-node_network_transmit_bytes_total{instance="$instance"}'],
171 ['Receive', 'Transmit'],
172 ) +
170 { yaxes: g.yaxes('Bps') }, 173 { yaxes: g.yaxes('Bps') },
171 ) 174 )
172 .addPanel( 175 .addPanel(
173 g.panel('Net Saturation (Dropped)') + 176 g.panel('Net Saturation (Drops Receive/Transmit)') +
174 g.queryPanel('instance:node_net_saturation:sum_irate{instance="$instance"}', 'Saturation') + 177 g.queryPanel(
175 { yaxes: g.yaxes('Bps') }, 178 ['node_network_receive_drop_total{instance="$instance"}', '-node_network_transmit_drop_total{instance="$instance"}'],
179 ['Receive drops', 'Transmit drops'],
180 ) +
181 { yaxes: g.yaxes('rps') },
176 ) 182 )
177 ) 183 )
178 .addRow( 184 .addRow(
diff --git a/docs/node-mixin/rules/rules.libsonnet b/docs/node-mixin/rules/rules.libsonnet
index c4bc31a..5422f44 100644
--- a/docs/node-mixin/rules/rules.libsonnet
+++ b/docs/node-mixin/rules/rules.libsonnet
@@ -17,7 +17,7 @@
17 }, 17 },
18 { 18 {
19 // CPU utilisation is % CPU is not idle. 19 // CPU utilisation is % CPU is not idle.
20 record: 'instance:node_cpu_utilisation:avg1m', 20 record: 'instance:node_cpu_utilisation:avg_rate1m',
21 expr: ||| 21 expr: |||
22 1 - avg without (cpu, mode) ( 22 1 - avg without (cpu, mode) (
23 rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}[1m]) 23 rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}[1m])
@@ -48,7 +48,7 @@
48 ||| % $._config, 48 ||| % $._config,
49 }, 49 },
50 { 50 {
51 record: 'instance:node_memory_swap_io_pages:sum_rate', 51 record: 'instance:node_memory_swap_io_pages:rate1m',
52 expr: ||| 52 expr: |||
53 ( 53 (
54 rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m]) 54 rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m])
@@ -58,42 +58,54 @@
58 ||| % $._config, 58 ||| % $._config,
59 }, 59 },
60 { 60 {
61 // Disk utilisation (ms spent, 1 second irate()) 61 // Disk utilisation (seconds spent, 1 second rate)
62 record: 'instance:node_disk_utilisation:sum_irate', 62 record: 'instance:node_disk_io_time:sum_rate1m',
63 expr: ||| 63 expr: |||
64 sum without (device) ( 64 sum without (device) (
65 irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) 65 rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m])
66 ) 66 )
67 ||| % $._config, 67 ||| % $._config,
68 }, 68 },
69 { 69 {
70 // Disk saturation (ms spent, by rate() it's bound by 1 second) 70 // Disk saturation (weighted seconds spent, 1 second rate)
71 record: 'instance:node_disk_saturation:sum_irate', 71 record: 'instance:node_disk_io_time_weighted:sum_rate1m',
72 expr: ||| 72 expr: |||
73 sum without (device) ( 73 sum without (device) (
74 irate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) 74 rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m])
75 ) 75 )
76 ||| % $._config, 76 ||| % $._config,
77 }, 77 },
78 // TODO: For the following two rules, consider configurable filtering to exclude more network 78 // TODO: For the following rules, consider configurable filtering to exclude more network
79 // device names than just "lo". 79 // device names than just "lo".
80 { 80 {
81 record: 'instance:node_net_utilisation:sum_irate', 81 record: 'instance:node_network_receive_bytes:sum_rate1m',
82 expr: ||| 82 expr: |||
83 sum without (device) ( 83 sum without (device) (
84 irate(node_network_receive_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m]) 84 rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m])
85 +
86 irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m])
87 ) 85 )
88 ||| % $._config, 86 ||| % $._config,
89 }, 87 },
90 { 88 {
91 record: 'instance:node_net_saturation:sum_irate', 89 record: 'instance:node_network_transmit_bytes:sum_rate1m',
92 expr: ||| 90 expr: |||
93 sum without (device) ( 91 sum without (device) (
94 irate(node_network_receive_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m]) 92 rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m])
95 + 93 )
96 irate(node_network_transmit_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m]) 94 ||| % $._config,
95 },
96 {
97 record: 'instance:node_network_receive_drop:sum_rate1m',
98 expr: |||
99 sum without (device) (
100 rate(node_network_receive_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m])
101 )
102 ||| % $._config,
103 },
104 {
105 record: 'instance:node_network_transmit_drop:sum_rate1m',
106 expr: |||
107 sum without (device) (
108 rate(node_network_transmit_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m])
97 ) 109 )
98 ||| % $._config, 110 ||| % $._config,
99 }, 111 },