Merge pull request #53 from projectsyn/feature/additional_rules

Add addon to add additional rules
projectsyn · Sep 28, 2022 · 22f16d8 · 22f16d8
2 parents bdd1369 + f503992
commit 22f16d8
Show file tree

Hide file tree

Showing 85 changed files with 26,122 additions and 3 deletions.
diff --git a/.cruft.json b/.cruft.json
@@ -1,13 +1,13 @@
 {
   "template": "https://github.com/projectsyn/commodore-component-template.git",
-  "commit": "f104a5fd69856c0e8cb7b27860876bd2f6386cf6",
+  "commit": "32f6976d3a3028d3fdb34ea312454f2ea7fda4a2",
   "checkout": "main",
   "context": {
     "cookiecutter": {
       "name": "prometheus",
       "slug": "prometheus",
       "parameter_key": "prometheus",
-      "test_cases": "defaults kubernetes_1.20 kubernetes_1.21 kubernetes_1.22 multi openshift rewrite-registries thanos cluster-monitoring kubernetes_1.23 kubernetes_1.24",
+      "test_cases": "defaults kubernetes_1.20 kubernetes_1.21 kubernetes_1.22 multi openshift rewrite-registries thanos cluster-monitoring kubernetes_1.23 kubernetes_1.24 additional_rules",
       "add_lib": "y",
       "add_pp": "n",
       "add_golden": "y",

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -43,6 +43,7 @@ jobs:
           - cluster-monitoring
           - kubernetes_1.23
           - kubernetes_1.24
+          - additional_rules
     defaults:
       run:
         working-directory: ${{ env.COMPONENT_NAME }}
@@ -68,6 +69,7 @@ jobs:
           - cluster-monitoring
           - kubernetes_1.23
           - kubernetes_1.24
+          - additional_rules
     defaults:
       run:
         working-directory: ${{ env.COMPONENT_NAME }}

diff --git a/Makefile.vars.mk b/Makefile.vars.mk
@@ -58,4 +58,4 @@ KUBENT_IMAGE    ?= docker.io/projectsyn/kubent:latest
 KUBENT_DOCKER   ?= $(DOCKER_CMD) $(DOCKER_ARGS) $(root_volume) --entrypoint=/app/kubent $(KUBENT_IMAGE)
 
 instance ?= defaults
-test_instances = tests/defaults.yml tests/kubernetes_1.20.yml tests/kubernetes_1.21.yml tests/kubernetes_1.22.yml tests/multi.yml tests/openshift.yml tests/rewrite-registries.yml tests/thanos.yml tests/cluster-monitoring.yml tests/kubernetes_1.23.yml tests/kubernetes_1.24.yml
+test_instances = tests/defaults.yml tests/kubernetes_1.20.yml tests/kubernetes_1.21.yml tests/kubernetes_1.22.yml tests/multi.yml tests/openshift.yml tests/rewrite-registries.yml tests/thanos.yml tests/cluster-monitoring.yml tests/kubernetes_1.23.yml tests/kubernetes_1.24.yml tests/additional_rules.yml
diff --git a/class/defaults.yml b/class/defaults.yml
@@ -36,6 +36,7 @@ parameters:
     addons:
       - disable-alerts
     addon_configs:
+      additional_rules: {}
       disable_alerts:
         # List of alertnames to exclude from the final ruleset
         ignoreNames:

diff --git a/component/addons/additional-rules.libsonnet b/component/addons/additional-rules.libsonnet
@@ -0,0 +1,31 @@
+local kap = import 'lib/kapitan.libjsonnet';
+local kube = import 'lib/kube.libjsonnet';
+local prom = import 'lib/prom.libsonnet';
+local inv = kap.inventory();
+local params = inv.parameters.prometheus;
+{  // Addon overlay: contributes extra rule groups built from params.addon_configs.additional_rules.
+  prometheus+: {
+    prometheusRule+: {
+      spec+: {
+        groups+: [  // `+:` appends to any inherited `groups` list rather than replacing it.
+          {
+            name: group_name,  // The config key is used verbatim as the rule group name.
+            rules: [
+              local rnamekey =
+                local k = std.splitLimit(rname, ':', 1);  // Split at the first ':' only, so the rule name itself may contain ':'.
+                assert std.member([ 'alert', 'record' ], k[0]) : 'Invalid custom rule key "%s", the component expects that custom rule keys are prefixed with either "alert:" or "record:"' % [ rname ];
+                k;  // NOTE(review): a key without ':' (e.g. "alert") passes the assert but has no k[1] -- confirm whether that should be rejected here.
+              params.addon_configs.additional_rules[group_name][rname] {
+                [rnamekey[0]]: rnamekey[1],  // Adds `alert: <name>` or `record: <name>` on top of the user-supplied rule body.
+              }
+              for rname in std.objectFields(params.addon_configs.additional_rules[group_name])
+              if params.addon_configs.additional_rules[group_name][rname] != null  // Rules set to null are skipped.
+            ],
+          }
+          for group_name in std.objectFields(params.addon_configs.additional_rules)
+          if params.addon_configs.additional_rules[group_name] != null  // Groups set to null are skipped.
+        ],
+      },
+    },
+  },
+}
diff --git a/docs/modules/ROOT/pages/references/addon-additional-rules.adoc b/docs/modules/ROOT/pages/references/addon-additional-rules.adoc
@@ -0,0 +1,29 @@
+= Addon: Additional rules
+
+This addon allows users to configure additional Prometheus rules to deploy on the cluster via the parameter `addon_configs.additional_rules`.
+
+Each key-value pair in the dictionary is transformed into a Prometheus rule group named after the key.
+
+The component expects each value to be a dict whose keys are prefixed with `record:` or `alert:` to indicate whether the rule is a recording or alerting rule. Groups and rules whose value is `null` are skipped.
+The component will transform the keys into fields in the resulting rule by taking the prefix as the field name and the rest of the key as the field value.
+For example, key `"record:sum:some:metric:5m"` would be transformed into `record: sum:some:metric:5m` which should define a recording rule with name `sum:some:metric:5m`.
+This field is then merged into the provided value which should be a valid rule definition.
+
+See the Prometheus docs for supported configurations for https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/[recording] and https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/[alerting] rules.
+
+
+Example:
+
+[source,yaml]
+----
+additional_rules:
+  generic-rules:
+    "alert:ContainerOOMKilled":
+      annotations:
+        message: A container ({{$labels.container}}) in pod {{ $labels.namespace }}/{{ $labels.pod }} was OOM killed
+      expr: |
+        kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} == 1
+      labels:
+        source: https://git.vshn.net/swisscompks/syn-tenant-repo/-/blob/master/common.yml
+        severity: devnull
+----
diff --git a/docs/modules/ROOT/partials/nav.adoc b/docs/modules/ROOT/partials/nav.adoc
@@ -4,6 +4,7 @@
 .Addons
 ** xref:references/addon-oauth2-proxy.adoc[OAuth2 Proxy]
 ** xref:references/addon-disable-alerts.adoc[Disable Alerts]
+** xref:references/addon-additional-rules.adoc[Additional Rules]
 
 .How-Tos
 * xref:how-tos/prometheus.adoc[Deploy Prometheus]

diff --git a/tests/additional_rules.yml b/tests/additional_rules.yml
@@ -0,0 +1,36 @@
+---
+parameters:
+  prometheus:
+    addons:
+      - additional-rules
+    addon_configs:
+      additional_rules:
+        generic-rules:
+          "alert:ContainerOOMKilled":
+            annotations:
+              message: "A container ({{$labels.container}}) in pod {{ $labels.namespace }}/{{ $labels.pod }} was OOM killed"
+            expr: |
+              kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} == 1
+            labels:
+              severity: devnull
+    instances:
+      default-instance:
+        prometheus:
+          enabled: true
+        alertmanager:
+          enabled: true
+        # Disabled for some speedup since it's irrelevant for this test
+        grafana:
+          enabled: false
+        nodeExporter:
+          enabled: true
+        blackboxExporter:
+          enabled: true
+        kubernetesControlPlane:
+          enabled: true
+        prometheusAdapter:
+          enabled: true
+        kubeStateMetrics:
+          enabled: true
+        kubePrometheus:
+          enabled: true
diff --git a/tests/golden/additional_rules/prometheus/apps/prometheus.yaml b/tests/golden/additional_rules/prometheus/apps/prometheus.yaml
@@ -0,0 +1,6 @@
+spec:
+  ignoreDifferences:
+    - group: ''
+      jsonPointers:
+        - /imagePullSecrets
+      kind: ServiceAccount
diff --git a/tests/golden/additional_rules/prometheus/prometheus/00_operator_namespace.yaml b/tests/golden/additional_rules/prometheus/prometheus/00_operator_namespace.yaml
@@ -0,0 +1,8 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  annotations: {}
+  labels:
+    SYNMonitoring: main
+    name: syn-prometheus-operator
+  name: syn-prometheus-operator
diff --git a/...ional_rules/prometheus/prometheus/100_default-instance_kubePrometheus_prometheusRule.yaml b/...ional_rules/prometheus/prometheus/100_default-instance_kubePrometheus_prometheusRule.yaml
@@ -0,0 +1,94 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  annotations:
+    source: https://github.com/projectsyn/component-prometheus
+  labels:
+    app.kubernetes.io/component: exporter
+    app.kubernetes.io/managed-by: commodore
+    app.kubernetes.io/name: kube-prometheus
+    app.kubernetes.io/part-of: kube-prometheus
+    monitoring.syn.tools/enabled: 'true'
+    prometheus: default-instance
+    role: alert-rules
+  name: kube-prometheus-rules
+  namespace: syn-prometheus
+spec:
+  groups:
+    - name: general.rules
+      rules:
+        - alert: TargetDown
+          annotations:
+            description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{
+              $labels.service }} targets in {{ $labels.namespace }} namespace are
+              down.'
+            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
+            summary: One or more targets are unreachable.
+          expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY
+            (job, namespace, service)) > 10
+          for: 10m
+          labels:
+            severity: warning
+        - alert: Watchdog
+          annotations:
+            description: 'This is an alert meant to ensure that the entire alerting
+              pipeline is functional.
+
+              This alert is always firing, therefore it should always be firing in
+              Alertmanager
+
+              and always fire against a receiver. There are integrations with various
+              notification
+
+              mechanisms that send a notification when this alert is not firing. For
+              example the
+
+              "DeadMansSnitch" integration in PagerDuty.
+
+              '
+            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
+            summary: An alert that should always be firing to certify that Alertmanager
+              is working properly.
+          expr: vector(1)
+          labels:
+            severity: none
+    - name: node-network
+      rules:
+        - alert: NodeNetworkInterfaceFlapping
+          annotations:
+            description: Network interface "{{ $labels.device }}" changing its up
+              status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod
+              }}
+            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping
+            summary: Network interface is often changing its status
+          expr: 'changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m])
+            > 2
+
+            '
+          for: 2m
+          labels:
+            severity: warning
+    - name: kube-prometheus-node-recording.rules
+      rules:
+        - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m]))
+            BY (instance)
+          record: instance:node_cpu:rate:sum
+        - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
+          record: instance:node_network_receive_bytes:rate:sum
+        - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
+          record: instance:node_network_transmit_bytes:rate:sum
+        - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
+            WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total)
+            BY (instance, cpu)) BY (instance)
+          record: instance:node_cpu:ratio
+        - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
+          record: cluster:node_cpu:sum_rate5m
+        - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total)
+            BY (instance, cpu))
+          record: cluster:node_cpu:ratio
+    - name: kube-prometheus-general.rules
+      rules:
+        - expr: count without(instance, pod, node) (up == 1)
+          record: count:up1
+        - expr: count without(instance, pod, node) (up == 0)
+          record: count:up0