about | summary | refs | log | tree | commit | diff
path: root/docs
diff options
context:
space:
mode:
author: beorn7 <beorn@grafana.com> 2019-08-14 22:24:24 +0200
committer: beorn7 <beorn@grafana.com> 2019-08-14 22:24:24 +0200
commit: 97ef11376219a1e3ee2c5f21f105bdeb26ef43d0 (patch)
tree: 470eba513f8c6c837ea932f8701d79aca54b0643 /docs
parent: 697c2deed59b414e73197e537192ef320533ceb3 (diff)
download: prometheus_node_collector-97ef11376219a1e3ee2c5f21f105bdeb26ef43d0.tar.bz2
prometheus_node_collector-97ef11376219a1e3ee2c5f21f105bdeb26ef43d0.tar.xz
prometheus_node_collector-97ef11376219a1e3ee2c5f21f105bdeb26ef43d0.zip
Make the severity of "critical" alerts configurable
This addresses the blissful scenario where single-node failures are unproblematic. No reason to wake somebody up if a node is about to screw itself up by filling the disk.

Signed-off-by: beorn7 <beorn@grafana.com>
Diffstat (limited to 'docs')
-rw-r--r-- docs/node-mixin/alerts/alerts.libsonnet | 8
-rw-r--r-- docs/node-mixin/config.libsonnet | 13
2 files changed, 17 insertions, 4 deletions
diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet
index 7b9fb89..4423f89 100644
--- a/docs/node-mixin/alerts/alerts.libsonnet
+++ b/docs/node-mixin/alerts/alerts.libsonnet
@@ -37,7 +37,7 @@
37 ||| % $._config, 37 ||| % $._config,
38 'for': '1h', 38 'for': '1h',
39 labels: { 39 labels: {
40 severity: 'critical', 40 severity: '%(nodeCriticalSeverity)s' % $._config,
41 }, 41 },
42 annotations: { 42 annotations: {
43 summary: 'Filesystem is predicted to run out of space within the next 4 hours.', 43 summary: 'Filesystem is predicted to run out of space within the next 4 hours.',
@@ -73,7 +73,7 @@
73 ||| % $._config, 73 ||| % $._config,
74 'for': '1h', 74 'for': '1h',
75 labels: { 75 labels: {
76 severity: 'critical', 76 severity: '%(nodeCriticalSeverity)s' % $._config,
77 }, 77 },
78 annotations: { 78 annotations: {
79 summary: 'Filesystem has less than 3% space left.', 79 summary: 'Filesystem has less than 3% space left.',
@@ -113,7 +113,7 @@
113 ||| % $._config, 113 ||| % $._config,
114 'for': '1h', 114 'for': '1h',
115 labels: { 115 labels: {
116 severity: 'critical', 116 severity: '%(nodeCriticalSeverity)s' % $._config,
117 }, 117 },
118 annotations: { 118 annotations: {
119 summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.', 119 summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.',
@@ -149,7 +149,7 @@
149 ||| % $._config, 149 ||| % $._config,
150 'for': '1h', 150 'for': '1h',
151 labels: { 151 labels: {
152 severity: 'critical', 152 severity: '%(nodeCriticalSeverity)s' % $._config,
153 }, 153 },
154 annotations: { 154 annotations: {
155 summary: 'Filesystem has less than 3% inodes left.', 155 summary: 'Filesystem has less than 3% inodes left.',
diff --git a/docs/node-mixin/config.libsonnet b/docs/node-mixin/config.libsonnet
index 95070ca..8cf9860 100644
--- a/docs/node-mixin/config.libsonnet
+++ b/docs/node-mixin/config.libsonnet
@@ -17,6 +17,19 @@
17 // them here, e.g. 'device!="tmpfs"'. 17 // them here, e.g. 'device!="tmpfs"'.
18 diskDeviceSelector: '', 18 diskDeviceSelector: '',
19 19
20 // Some of the alerts are meant to fire if a critical failure of a
21 // node is imminent (e.g. the disk is about to run full). In a
22 // true “cloud native” setup, failures of a single node should be
23 // tolerated. Hence, even imminent failure of a single node is no
24 // reason to create a paging alert. However, in practice there are
25 // still many situations where operators like to get paged in time
26 // before a node runs out of disk space. nodeCriticalSeverity can
27 // be set to the desired severity for this kind of alerts. This
28 // can even be templated to depend on labels of the node, e.g. you
29 // could make this critical for traditional database masters but
30 // just a warning for K8s nodes.
31 nodeCriticalSeverity: 'critical',
32
20 grafana_prefix: '', 33 grafana_prefix: '',
21 }, 34 },
22} 35}