diff --git a/build/kube-prometheus/mixins/argo-cd/mixin.libsonnet b/build/kube-prometheus/mixins/argo-cd/mixin.libsonnet index c3ba22b20..e9e55504a 100644 --- a/build/kube-prometheus/mixins/argo-cd/mixin.libsonnet +++ b/build/kube-prometheus/mixins/argo-cd/mixin.libsonnet @@ -18,7 +18,7 @@ }, annotations: { description: 'The application **{{ .Labels.argocd_application_name }}**/**{{ .Labels.application_namespace }}** has been out of sync for more than 30 minutes.', - summary: 'Kubernetes version is close to end of support', + summary: 'The application has been out of sync for more than 30 minutes.', }, }, { @@ -31,7 +31,7 @@ }, annotations: { description: 'Argo CD WhiteListed Application **{{ .Labels.argocd_application_name }}**/**{{ .Labels.application_namespace }}** sync failed.', - summary: 'The application**{{ .Labels.argocd_application_name }}**/**{{ .Labels.application_namespace }}** has been out of sync for more than 15 minutes.', + summary: 'The application has been out of sync for more than 15 minutes.', }, }, // Inspiration from here https://github.com/adinhodovic/argo-cd-mixin/blob/main/alerts/alerts.libsonnet diff --git a/build/kube-prometheus/mixins/argo-cd/prometheus.yaml b/build/kube-prometheus/mixins/argo-cd/prometheus.yaml index 43e4566f6..581151c56 100644 --- a/build/kube-prometheus/mixins/argo-cd/prometheus.yaml +++ b/build/kube-prometheus/mixins/argo-cd/prometheus.yaml @@ -4,7 +4,7 @@ groups: - alert: WhiteListedApplicationOutOfSync annotations: description: The application **{{ .Labels.argocd_application_name }}**/**{{ .Labels.application_namespace }}** has been out of sync for more than 30 minutes. - summary: Kubernetes version is close to end of support + summary: The application has been out of sync for more than 30 minutes. 
expr: argocd_application_sync_state{argocd_application_name!="", application_namespace!="", whitelisted="true", result="waiting"} == 1 for: 30m labels: @@ -13,7 +13,7 @@ groups: - alert: CronSyncFailed annotations: description: Argo CD WhiteListed Application **{{ .Labels.argocd_application_name }}**/**{{ .Labels.application_namespace }}** sync failed. - summary: The application**{{ .Labels.argocd_application_name }}**/**{{ .Labels.application_namespace }}** has been out of sync for more than 15 minutes. + summary: The application has been out of sync for more than 15 minutes. expr: argocd_application_sync_state{argocd_application_name!="", application_namespace!="", whitelisted="true", result="failed"} == 1 for: 15m labels: diff --git a/build/kube-prometheus/mixins/kube-version/mixin.libsonnet b/build/kube-prometheus/mixins/kube-version/mixin.libsonnet index c589bc42c..a75cd302b 100644 --- a/build/kube-prometheus/mixins/kube-version/mixin.libsonnet +++ b/build/kube-prometheus/mixins/kube-version/mixin.libsonnet @@ -9,55 +9,55 @@ name: 'kubernetes-version-info', rules: [ { - alert: 'KubernetesVersionInfoEOS', + alert: 'KubernetesVersionInfoEos', expr: 'count by (certname, current_version, end_of_support_date) (kubernetes_version_info_eos <= 30 and kubernetes_version_info_eos > 0)', 'for': '15m', labels: { severity: 'warning', - alert_id: 'KubernetesVersionInfoEOS', + alert_id: 'KubernetesVersionInfoEos', }, annotations: { description: 'The Kubernetes version on the cluster **{{ .Labels.certname }}** (that is version **{{ .Labels.current_version }}**) is **{{ .Value }}** days away from its end of support date which is **{{ .Labels.end_of_support_date }}**. You really should upgrade to ensure you will still get security updates. 
Please visit https://kubernetes.io/releases/patch-releases/ for more version related information.', - summary: 'Kubernetes version is close to end of support', + summary: 'Kubernetes version is close to end of support.', }, }, { - alert: 'KubernetesVersionInfoEOS', + alert: 'KubernetesVersionInfoEos', expr: 'count by (certname, current_version, end_of_support_date) (kubernetes_version_info_eos < 0 )', 'for': '15m', labels: { severity: 'critical', - alert_id: 'KubernetesVersionInfoEOS', + alert_id: 'KubernetesVersionInfoEos', }, annotations: { description: 'The Kubernetes version on the cluster **{{ .Labels.certname }}** (that is version **{{ .Labels.current_version }}**) reached its end of support date on **{{ .Labels.end_of_support_date }}**. You really should upgrade to ensure you will still get security updates. Please visit https://kubernetes.io/releases/patch-releases/ for more version related information.', - summary: 'Kubernetes version has reached its end of support', + summary: 'Kubernetes version has reached its end of support.', }, }, { - alert: 'KubernetesVersionInfoEOL', + alert: 'KubernetesVersionInfoEol', expr: 'count by (certname, current_version, end_of_life_date) (kubernetes_version_info_eol <= 60 and kubernetes_version_info_eol > 0)', 'for': '15m', labels: { severity: 'warning', - alert_id: 'KubernetesVersionInfoEOL', + alert_id: 'KubernetesVersionInfoEol', }, annotations: { description: 'The Kubernetes version on the cluster **{{ .Labels.certname }}** (that is version **{{ .Labels.current_version }}**) is **{{ .Value }}** days away from its end of life date which is **{{ .Labels.end_of_life_date }}**. You really should upgrade soon. 
Please visit https://kubernetes.io/releases/patch-releases/ for more version related information.', - summary: 'Kubernetes version is close to end of life', + summary: 'Kubernetes version is close to end of life.', }, }, { - alert: 'KubernetesVersionInfoEOL', + alert: 'KubernetesVersionInfoEol', expr: 'count by (certname, current_version, end_of_life_date) (kubernetes_version_info_eol < 0)', 'for': '15m', labels: { severity: 'critical', - alert_id: 'KubernetesVersionInfoEOL', + alert_id: 'KubernetesVersionInfoEol', }, annotations: { description: 'The Kubernetes version on the cluster **{{ .Labels.certname }}** (that is version **{{ .Labels.current_version }}**) reached its end of life on **{{ .Labels.end_of_life_date }}**. It is now vital to upgrade. Please visit https://kubernetes.io/releases/patch-releases/ for more version related information.', - summary: 'Kubernetes version has reached its end of life', + summary: 'Kubernetes version has reached its end of life.', }, }, ], diff --git a/build/kube-prometheus/mixins/kube-version/prometheus.yaml b/build/kube-prometheus/mixins/kube-version/prometheus.yaml index eb537915e..5d438dc32 100644 --- a/build/kube-prometheus/mixins/kube-version/prometheus.yaml +++ b/build/kube-prometheus/mixins/kube-version/prometheus.yaml @@ -1,39 +1,39 @@ groups: - name: kubernetes-version-info rules: - - alert: KubernetesVersionInfoEOS + - alert: KubernetesVersionInfoEos annotations: description: The Kubernetes version on the cluster **{{ .Labels.certname }}** (that is version **{{ .Labels.current_version }}**) is **{{ .Value }}** days away from its end of support date which is **{{ .Labels.end_of_support_date }}**. You really should upgrade to ensure you will still get security updates. Please visit https://kubernetes.io/releases/patch-releases/ for more version related information. - summary: Kubernetes version is close to end of support + summary: Kubernetes version is close to end of support. 
expr: count by (certname, current_version, end_of_support_date) (kubernetes_version_info_eos <= 30 and kubernetes_version_info_eos > 0) for: 15m labels: - alert_id: KubernetesVersionInfoEOS + alert_id: KubernetesVersionInfoEos severity: warning - - alert: KubernetesVersionInfoEOS + - alert: KubernetesVersionInfoEos annotations: description: The Kubernetes version on the cluster **{{ .Labels.certname }}** (that is version **{{ .Labels.current_version }}**) reached its end of support date on **{{ .Labels.end_of_support_date }}**. You really should upgrade to ensure you will still get security updates. Please visit https://kubernetes.io/releases/patch-releases/ for more version related information. - summary: Kubernetes version has reached its end of support + summary: Kubernetes version has reached its end of support. expr: count by (certname, current_version, end_of_support_date) (kubernetes_version_info_eos < 0 ) for: 15m labels: - alert_id: KubernetesVersionInfoEOS + alert_id: KubernetesVersionInfoEos severity: critical - - alert: KubernetesVersionInfoEOL + - alert: KubernetesVersionInfoEol annotations: description: The Kubernetes version on the cluster **{{ .Labels.certname }}** (that is version **{{ .Labels.current_version }}**) is **{{ .Value }}** days away from its end of life date which is **{{ .Labels.end_of_life_date }}**. You really should upgrade soon. Please visit https://kubernetes.io/releases/patch-releases/ for more version related information. - summary: Kubernetes version is close to end of life + summary: Kubernetes version is close to end of life. 
expr: count by (certname, current_version, end_of_life_date) (kubernetes_version_info_eol <= 60 and kubernetes_version_info_eol > 0) for: 15m labels: - alert_id: KubernetesVersionInfoEOL + alert_id: KubernetesVersionInfoEol severity: warning - - alert: KubernetesVersionInfoEOL + - alert: KubernetesVersionInfoEol annotations: description: The Kubernetes version on the cluster **{{ .Labels.certname }}** (that is version **{{ .Labels.current_version }}**) reached its end of life on **{{ .Labels.end_of_life_date }}**. It is now vital to upgrade. Please visit https://kubernetes.io/releases/patch-releases/ for more version related information. - summary: Kubernetes version has reached its end of life + summary: Kubernetes version has reached its end of life. expr: count by (certname, current_version, end_of_life_date) (kubernetes_version_info_eol < 0) for: 15m labels: - alert_id: KubernetesVersionInfoEOL + alert_id: KubernetesVersionInfoEol severity: critical diff --git a/build/kube-prometheus/mixins/mdraid/mixin.libsonnet b/build/kube-prometheus/mixins/mdraid/mixin.libsonnet index 34e69e396..ac2cb5436 100644 --- a/build/kube-prometheus/mixins/mdraid/mixin.libsonnet +++ b/build/kube-prometheus/mixins/mdraid/mixin.libsonnet @@ -17,7 +17,7 @@ alert_id: 'HostRaidDiskDegraded', }, annotations: { - summary: 'RAID Array is degraded', + summary: 'RAID Array is degraded.', description: 'RAID array {{ $labels.device }} on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically', }, }, @@ -30,7 +30,7 @@ alert_id: 'HostRaidDiskFailure', }, annotations: { - summary: 'Host RAID disk failure on instance {{ $labels.instance }}', + summary: 'Host RAID disk failure on instance.', description: 'At least one disk in RAID array {{ $labels.device }} on {{ $labels.instance }} failed. 
Array {{ $labels.device }} needs attention and possibly a disk swap', }, }, diff --git a/build/kube-prometheus/mixins/mdraid/prometheus.yaml b/build/kube-prometheus/mixins/mdraid/prometheus.yaml index 580e762bc..12419ddc1 100644 --- a/build/kube-prometheus/mixins/mdraid/prometheus.yaml +++ b/build/kube-prometheus/mixins/mdraid/prometheus.yaml @@ -4,7 +4,7 @@ groups: - alert: HostRaidDiskDegraded annotations: description: RAID array {{ $labels.device }} on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically - summary: RAID Array is degraded + summary: RAID Array is degraded. expr: (node_md_state{state="inactive"} > 0) * on(instance) group_left (pod) node_uname_info{nodename=~".+"} for: 15m labels: @@ -13,7 +13,7 @@ groups: - alert: HostRaidDiskFailure annotations: description: At least one disk in RAID array {{ $labels.device }} on {{ $labels.instance }} failed. Array {{ $labels.device }} needs attention and possibly a disk swap - summary: Host RAID disk failure on instance {{ $labels.instance }} + summary: Host RAID disk failure on instance. 
expr: (node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} for: 15m labels: diff --git a/build/kube-prometheus/mixins/monitoring/mixin.libsonnet b/build/kube-prometheus/mixins/monitoring/mixin.libsonnet index 888efce17..cc0c89341 100644 --- a/build/kube-prometheus/mixins/monitoring/mixin.libsonnet +++ b/build/kube-prometheus/mixins/monitoring/mixin.libsonnet @@ -8,16 +8,16 @@ name: 'monitoring', rules: [ { - alert: 'monitor::monitoring_stack::watchdog_missing', + alert: 'WatchdogMissing', expr: 'changes(watchdog_alerts_total[65m]) == 0', 'for': '10m', labels: { severity: 'critical', - alert_id: 'monitor::monitoring_stack::watchdog_missing', + alert_id: 'WatchdogMissing', }, annotations: { description: 'Prometheus Stack instance **{{ $labels.exported_instance }}** has stopped sending watchdog alerts', - summary: 'Prometheus Stack instance **{{ $labels.exported_instance }}** has sent less than 1 watchdog alert in the past 35 minutes.', + summary: 'Prometheus Stack instance has sent less than 1 watchdog alert in the past 35 minutes.', }, }, ], diff --git a/build/kube-prometheus/mixins/monitoring/prometheus.yaml b/build/kube-prometheus/mixins/monitoring/prometheus.yaml index cd0679ad4..353e37adb 100644 --- a/build/kube-prometheus/mixins/monitoring/prometheus.yaml +++ b/build/kube-prometheus/mixins/monitoring/prometheus.yaml @@ -1,12 +1,12 @@ groups: - name: monitoring rules: - - alert: monitor::monitoring_stack::watchdog_missing + - alert: WatchdogMissing annotations: description: Prometheus Stack instance **{{ $labels.exported_instance }}** has stopped sending watchdog alerts - summary: Prometheus Stack instance **{{ $labels.exported_instance }}** has sent less than 1 watchdog alert in the past 35 minutes. + summary: Prometheus Stack instance has sent less than 1 watchdog alert in the past 35 minutes. 
expr: changes(watchdog_alerts_total[65m]) == 0 for: 10m labels: - alert_id: monitor::monitoring_stack::watchdog_missing + alert_id: WatchdogMissing severity: critical diff --git a/build/kube-prometheus/mixins/smartmon/mixin.libsonnet b/build/kube-prometheus/mixins/smartmon/mixin.libsonnet index a8e951edc..750417c67 100644 --- a/build/kube-prometheus/mixins/smartmon/mixin.libsonnet +++ b/build/kube-prometheus/mixins/smartmon/mixin.libsonnet @@ -19,7 +19,7 @@ annotations: { description: 'Disk **{{ .Labels.device }}** has disk sata failure on instance **{{ .Labels.instance }}** UDMA_CRC_Error_Count - The number of errors related to data transfer over the interface. A value of **{{ .Value }}** is concerning and indicates potential issues with the data cable or connections.', - summary: 'The device **{{ .Labels.device }}** has disk sata failures.' + summary: 'The device has disk sata failures.' }, }, ], diff --git a/build/kube-prometheus/mixins/smartmon/prometheus.yaml b/build/kube-prometheus/mixins/smartmon/prometheus.yaml index b8a9fdb16..625bff3a8 100644 --- a/build/kube-prometheus/mixins/smartmon/prometheus.yaml +++ b/build/kube-prometheus/mixins/smartmon/prometheus.yaml @@ -6,7 +6,7 @@ groups: description: |- Disk **{{ .Labels.device }}** has disk sata failure on instance **{{ .Labels.instance }}** UDMA_CRC_Error_Count - The number of errors related to data transfer over the interface. A value of **{{ .Value }}** is concerning and indicates potential issues with the data cable or connections. - summary: The device **{{ .Labels.device }}** has disk sata failures. + summary: The device has disk sata failures. 
expr: sum by (instance, device) (smartctl_device_attribute{attribute_name="UDMA_CRC_Error_Count", attribute_value_type="raw"} >= smartctl_device_attribute{attribute_name="UDMA_CRC_Error_Count", attribute_value_type="worst"}) for: 3h labels: diff --git a/build/kube-prometheus/mixins/velero/mixin.libsonnet b/build/kube-prometheus/mixins/velero/mixin.libsonnet index 5b58874ed..df77c9598 100644 --- a/build/kube-prometheus/mixins/velero/mixin.libsonnet +++ b/build/kube-prometheus/mixins/velero/mixin.libsonnet @@ -21,7 +21,7 @@ }, annotations: { description: 'Velero backup was not successful for {{ $labels.schedule }}.', - summary: 'Velero backup for schedule {{ $labels.schedule }} was unsuccessful.', + summary: 'Velero backup for schedule was unsuccessful.', }, }, ], diff --git a/build/kube-prometheus/mixins/velero/prometheus.yaml b/build/kube-prometheus/mixins/velero/prometheus.yaml index 97102e473..f2a8409f5 100644 --- a/build/kube-prometheus/mixins/velero/prometheus.yaml +++ b/build/kube-prometheus/mixins/velero/prometheus.yaml @@ -4,7 +4,7 @@ groups: - alert: VeleroUnsuccessfulBackup annotations: description: Velero backup was not successful for {{ $labels.schedule }}. - summary: Velero backup for schedule {{ $labels.schedule }} was unsuccessful. + summary: Velero backup for schedule was unsuccessful. 
expr: | ((time() - velero_backup_last_successful_timestamp{schedule=~".*6hrly.*"}) + on(schedule) group_left velero_backup_attempt_total > (60 * 60 * 6) and ON() hour() >= 6.30 <= 18.30) or ((time() - velero_backup_last_successful_timestamp{schedule=~".*daily.*"}) + on(schedule) group_left velero_backup_attempt_total > (60 * 60 * 24) and ON() day_of_week() != 0) or ((time() - velero_backup_last_successful_timestamp{schedule=~".*weekly.*"}) + on(schedule) group_left velero_backup_attempt_total > (60 * 60 * 24 * 7)) for: 15m diff --git a/build/kube-prometheus/mixins/zfs/mixin.libsonnet b/build/kube-prometheus/mixins/zfs/mixin.libsonnet index 59c33a200..174232b6a 100644 --- a/build/kube-prometheus/mixins/zfs/mixin.libsonnet +++ b/build/kube-prometheus/mixins/zfs/mixin.libsonnet @@ -17,7 +17,7 @@ alert_id: 'ZFSPoolStatus', }, annotations: { - summary: 'ZFS Pool is Degraded', + summary: 'ZFS Pool is Degraded.', description: 'The zfs pool **{{ .Labels.zpool }}** is {{ .Labels.state }} on {{ .Labels.instance }}', }, }, diff --git a/build/kube-prometheus/mixins/zfs/prometheus.yaml b/build/kube-prometheus/mixins/zfs/prometheus.yaml index 949ebf051..d8ec398da 100644 --- a/build/kube-prometheus/mixins/zfs/prometheus.yaml +++ b/build/kube-prometheus/mixins/zfs/prometheus.yaml @@ -4,7 +4,7 @@ groups: - alert: ZFSPoolStatus annotations: description: The zfs pool **{{ .Labels.zpool }}** is {{ .Labels.state }} on {{ .Labels.instance }} - summary: ZFS Pool is Degraded + summary: ZFS Pool is Degraded. expr: node_zfs_zpool_state{state!="online"} > 0 for: 30m labels: