From 97ef11376219a1e3ee2c5f21f105bdeb26ef43d0 Mon Sep 17 00:00:00 2001
From: beorn7 <beorn@grafana.com>
Date: Wed, 14 Aug 2019 22:24:24 +0200
Subject: Make the severity of "critical" alerts configurable

This addresses the blissful scenario where single-node failures are
unproblematic. No reason to wake somebody up if a node is about to
screw itself up by filling the disk.

Signed-off-by: beorn7 <beorn@grafana.com>
---
 docs/node-mixin/alerts/alerts.libsonnet |  8 ++++----
 docs/node-mixin/config.libsonnet        | 13 +++++++++++++
 2 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'docs')

diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet
index 7b9fb89..4423f89 100644
--- a/docs/node-mixin/alerts/alerts.libsonnet
+++ b/docs/node-mixin/alerts/alerts.libsonnet
@@ -37,7 +37,7 @@
             ||| % $._config,
             'for': '1h',
             labels: {
-              severity: 'critical',
+              severity: '%(nodeCriticalSeverity)s' % $._config,
             },
             annotations: {
               summary: 'Filesystem is predicted to run out of space within the next 4 hours.',
@@ -73,7 +73,7 @@
             ||| % $._config,
             'for': '1h',
             labels: {
-              severity: 'critical',
+              severity: '%(nodeCriticalSeverity)s' % $._config,
             },
             annotations: {
               summary: 'Filesystem has less than 3% space left.',
@@ -113,7 +113,7 @@
             ||| % $._config,
             'for': '1h',
             labels: {
-              severity: 'critical',
+              severity: '%(nodeCriticalSeverity)s' % $._config,
             },
             annotations: {
               summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.',
@@ -149,7 +149,7 @@
             ||| % $._config,
             'for': '1h',
             labels: {
-              severity: 'critical',
+              severity: '%(nodeCriticalSeverity)s' % $._config,
             },
             annotations: {
               summary: 'Filesystem has less than 3% inodes left.',
diff --git a/docs/node-mixin/config.libsonnet b/docs/node-mixin/config.libsonnet
index 95070ca..8cf9860 100644
--- a/docs/node-mixin/config.libsonnet
+++ b/docs/node-mixin/config.libsonnet
@@ -17,6 +17,19 @@
     // them here, e.g. 'device!="tmpfs"'.
     diskDeviceSelector: '',
 
+    // Some of the alerts are meant to fire if a critical failure of a
+    // node is imminent (e.g. the disk is about to run full). In a
+    // true “cloud native” setup, failures of a single node should be
+    // tolerated. Hence, even imminent failure of a single node is no
+    // reason to create a paging alert. However, in practice there are
+    // still many situations where operators like to get paged in time
+    // before a node runs out of disk space. nodeCriticalSeverity can
+    // be set to the desired severity for this kind of alert. This
+    // can even be templated to depend on labels of the node, e.g. you
+    // could make this critical for traditional database masters but
+    // just a warning for K8s nodes.
+    nodeCriticalSeverity: 'critical',
+
     grafana_prefix: '',
   },
 }
-- 
cgit v1.2.3