Skip to content

Commit

Permalink
Merge pull request #53 from projectsyn/feature/additional_rules
Browse files Browse the repository at this point in the history
Add addon to add additional rules
  • Loading branch information
TheBigLee authored Sep 28, 2022
2 parents bdd1369 + f503992 commit 22f16d8
Show file tree
Hide file tree
Showing 85 changed files with 26,122 additions and 3 deletions.
4 changes: 2 additions & 2 deletions .cruft.json
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
{
"template": "https://github.com/projectsyn/commodore-component-template.git",
"commit": "f104a5fd69856c0e8cb7b27860876bd2f6386cf6",
"commit": "32f6976d3a3028d3fdb34ea312454f2ea7fda4a2",
"checkout": "main",
"context": {
"cookiecutter": {
"name": "prometheus",
"slug": "prometheus",
"parameter_key": "prometheus",
"test_cases": "defaults kubernetes_1.20 kubernetes_1.21 kubernetes_1.22 multi openshift rewrite-registries thanos cluster-monitoring kubernetes_1.23 kubernetes_1.24",
"test_cases": "defaults kubernetes_1.20 kubernetes_1.21 kubernetes_1.22 multi openshift rewrite-registries thanos cluster-monitoring kubernetes_1.23 kubernetes_1.24 additional_rules",
"add_lib": "y",
"add_pp": "n",
"add_golden": "y",
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ jobs:
- cluster-monitoring
- kubernetes_1.23
- kubernetes_1.24
- additional_rules
defaults:
run:
working-directory: ${{ env.COMPONENT_NAME }}
Expand All @@ -68,6 +69,7 @@ jobs:
- cluster-monitoring
- kubernetes_1.23
- kubernetes_1.24
- additional_rules
defaults:
run:
working-directory: ${{ env.COMPONENT_NAME }}
Expand Down
2 changes: 1 addition & 1 deletion Makefile.vars.mk
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,4 @@ KUBENT_IMAGE ?= docker.io/projectsyn/kubent:latest
KUBENT_DOCKER ?= $(DOCKER_CMD) $(DOCKER_ARGS) $(root_volume) --entrypoint=/app/kubent $(KUBENT_IMAGE)

instance ?= defaults
test_instances = tests/defaults.yml tests/kubernetes_1.20.yml tests/kubernetes_1.21.yml tests/kubernetes_1.22.yml tests/multi.yml tests/openshift.yml tests/rewrite-registries.yml tests/thanos.yml tests/cluster-monitoring.yml tests/kubernetes_1.23.yml tests/kubernetes_1.24.yml
test_instances = tests/defaults.yml tests/kubernetes_1.20.yml tests/kubernetes_1.21.yml tests/kubernetes_1.22.yml tests/multi.yml tests/openshift.yml tests/rewrite-registries.yml tests/thanos.yml tests/cluster-monitoring.yml tests/kubernetes_1.23.yml tests/kubernetes_1.24.yml tests/additional_rules.yml
1 change: 1 addition & 0 deletions class/defaults.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ parameters:
addons:
- disable-alerts
addon_configs:
additional_rules: {}
disable_alerts:
# List of alertnames to exclude from the final ruleset
ignoreNames:
Expand Down
31 changes: 31 additions & 0 deletions component/addons/additional-rules.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
local kap = import 'lib/kapitan.libjsonnet';
local kube = import 'lib/kube.libjsonnet';
local prom = import 'lib/prom.libsonnet';
local inv = kap.inventory();
local params = inv.parameters.prometheus;
// User-supplied rule groups from `parameters.prometheus.addon_configs.additional_rules`.
// Each field is a group name mapping to an object of rules.
local additional_rules = params.addon_configs.additional_rules;

// Addon overlay: appends one Prometheus rule group per non-null entry in
// `additional_rules` to `prometheus.prometheusRule.spec.groups`.
{
  prometheus+: {
    prometheusRule+: {
      spec+: {
        groups+: [
          {
            name: group_name,
            rules: [
              // Split "alert:Name" / "record:name" at the first ':' only, so
              // that recording rule names containing ':' stay intact.
              local rnamekey =
                local k = std.splitLimit(rname, ':', 1);
                // Require both a prefix and a name. Without the length check a
                // bare key "alert" or "record" would pass validation and then
                // fail below with an index-out-of-bounds error on `k[1]`.
                assert std.length(k) == 2 && std.member([ 'alert', 'record' ], k[0]) : 'Invalid custom rule key "%s", the component expects that custom rule keys are prefixed with either "alert:" or "record:"' % [ rname ];
                k;
              // Merge `<prefix>: <name>` into the user-provided rule body.
              additional_rules[group_name][rname] {
                [rnamekey[0]]: rnamekey[1],
              }
              for rname in std.objectFields(additional_rules[group_name])
              // Rules set to null are skipped.
              if additional_rules[group_name][rname] != null
            ],
          }
          for group_name in std.objectFields(additional_rules)
          // Groups set to null are skipped.
          if additional_rules[group_name] != null
        ],
      },
    },
  },
}
29 changes: 29 additions & 0 deletions docs/modules/ROOT/pages/references/addon-additional-rules.adoc
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
= Addon: Additional rules

This addon allows users to configure additional Prometheus rules to deploy on the cluster via the parameter `addon_configs.additional_rules`.

Each key-value pair in the dictionary is transformed into a Prometheus rule group by the component.

The component expects each value to be a dict itself, whose keys are prefixed with `record:` or `alert:` to indicate whether the rule is a recording or an alerting rule.
The component will transform the keys into fields in the resulting rule by taking the prefix as the field name and the rest of the key as the field value.
For example, key `"record:sum:some:metric:5m"` would be transformed into `record: sum:some:metric:5m` which should define a recording rule with name `sum:some:metric:5m`.
This field is then merged into the provided value which should be a valid rule definition.

See the Prometheus docs for supported configurations for https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/[recording] and https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/[alerting] rules.


Example:

[source,yaml]
----
additional_rules:
  generic-rules:
    "alert:ContainerOOMKilled":
      annotations:
        message: A container ({{$labels.container}}) in pod {{ $labels.namespace }}/{{ $labels.pod }} was OOM killed
      expr: |
        kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} == 1
      labels:
        source: https://git.vshn.net/swisscompks/syn-tenant-repo/-/blob/master/common.yml
        severity: devnull
----
1 change: 1 addition & 0 deletions docs/modules/ROOT/partials/nav.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
.Addons
** xref:references/addon-oauth2-proxy.adoc[OAuth2 Proxy]
** xref:references/addon-disable-alerts.adoc[Disable Alerts]
** xref:references/addon-additional-rules.adoc[Additional Rules]

.How-Tos
* xref:how-tos/prometheus.adoc[Deploy Prometheus]
Expand Down
36 changes: 36 additions & 0 deletions tests/additional_rules.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Test inventory for the `additional-rules` addon.
# Enables the addon and configures a single alerting rule
# (`ContainerOOMKilled`) in the rule group `generic-rules`.
---
parameters:
prometheus:
addons:
- additional-rules
addon_configs:
additional_rules:
# Group name; each key below it is a rule key prefixed with "alert:" or "record:".
generic-rules:
"alert:ContainerOOMKilled":
annotations:
message: "A container ({{$labels.container}}) in pod {{ $labels.namespace }}/{{ $labels.pod }} was OOM killed"
expr: |
kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} == 1
labels:
severity: devnull
instances:
default-instance:
prometheus:
enabled: true
alertmanager:
enabled: true
# Disabled for some speedup since it's irrelevant for this test
grafana:
enabled: false
nodeExporter:
enabled: true
blackboxExporter:
enabled: true
kubernetesControlPlane:
enabled: true
prometheusAdapter:
enabled: true
kubeStateMetrics:
enabled: true
kubePrometheus:
enabled: true
6 changes: 6 additions & 0 deletions tests/golden/additional_rules/prometheus/apps/prometheus.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
spec:
ignoreDifferences:
- group: ''
jsonPointers:
- /imagePullSecrets
kind: ServiceAccount
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
apiVersion: v1
kind: Namespace
metadata:
annotations: {}
labels:
SYNMonitoring: main
name: syn-prometheus-operator
name: syn-prometheus-operator
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
annotations:
source: https://github.com/projectsyn/component-prometheus
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/managed-by: commodore
app.kubernetes.io/name: kube-prometheus
app.kubernetes.io/part-of: kube-prometheus
monitoring.syn.tools/enabled: 'true'
prometheus: default-instance
role: alert-rules
name: kube-prometheus-rules
namespace: syn-prometheus
spec:
groups:
- name: general.rules
rules:
- alert: TargetDown
annotations:
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{
$labels.service }} targets in {{ $labels.namespace }} namespace are
down.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
summary: One or more targets are unreachable.
expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY
(job, namespace, service)) > 10
for: 10m
labels:
severity: warning
- alert: Watchdog
annotations:
description: 'This is an alert meant to ensure that the entire alerting
pipeline is functional.
This alert is always firing, therefore it should always be firing in
Alertmanager
and always fire against a receiver. There are integrations with various
notification
mechanisms that send a notification when this alert is not firing. For
example the
"DeadMansSnitch" integration in PagerDuty.
'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
summary: An alert that should always be firing to certify that Alertmanager
is working properly.
expr: vector(1)
labels:
severity: none
- name: node-network
rules:
- alert: NodeNetworkInterfaceFlapping
annotations:
description: Network interface "{{ $labels.device }}" changing its up
status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod
}}
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping
summary: Network interface is often changing its status
expr: 'changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m])
> 2
'
for: 2m
labels:
severity: warning
- name: kube-prometheus-node-recording.rules
rules:
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m]))
BY (instance)
record: instance:node_cpu:rate:sum
- expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
record: instance:node_network_receive_bytes:rate:sum
- expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
record: instance:node_network_transmit_bytes:rate:sum
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total)
BY (instance, cpu)) BY (instance)
record: instance:node_cpu:ratio
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
record: cluster:node_cpu:sum_rate5m
- expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total)
BY (instance, cpu))
record: cluster:node_cpu:ratio
- name: kube-prometheus-general.rules
rules:
- expr: count without(instance, pod, node) (up == 1)
record: count:up1
- expr: count without(instance, pod, node) (up == 0)
record: count:up0
Loading

0 comments on commit 22f16d8

Please sign in to comment.