diff --git a/class/defaults.yml b/class/defaults.yml index a272010..dc969e3 100644 --- a/class/defaults.yml +++ b/class/defaults.yml @@ -52,6 +52,19 @@ parameters: _sli: volume_plugin: "kubernetes.io/csi.+" operation_name: ".+" + canary: + enabled: true + objective: 99.0 + _sli: + volume_plugins_default_params: + size: 1Gi + accessMode: ReadWriteOnce + interval: 1m + maxPodCompletionTimeout: 3m + + volume_plugins: + # Empty value for the default plugin + "": {} ingress: canary: enabled: true @@ -76,6 +89,8 @@ parameters: specs: {} + secrets: {} + controller_node_affinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: diff --git a/component/main.jsonnet b/component/main.jsonnet index 84f63a1..ac6c502 100644 --- a/component/main.jsonnet +++ b/component/main.jsonnet @@ -100,6 +100,90 @@ local canary = kube._Object('monitoring.appuio.io/v1beta1', 'SchedulerCanary', ' }, }; +local storageCanaries = std.flattenArrays(std.filterMap( + function(storageclass) params.slos.storage.canary._sli.volume_plugins[storageclass] != null, + function(storageclass) + local p = params.slos.storage.canary._sli.volume_plugins_default_params + com.makeMergeable(params.slos.storage.canary._sli.volume_plugins[storageclass]); + local manifestName = 'storage-canary-%s' % if storageclass == '' then 'default' else storageclass; + [ + kube.PersistentVolumeClaim(manifestName) { + metadata+: { + namespace: params.namespace, + }, + spec+: { + accessModes: [ p.accessMode ], + resources: { + requests: { + storage: p.size, + }, + }, + [if storageclass != '' then 'storageClassName']: storageclass, + }, + }, + kube._Object('monitoring.appuio.io/v1beta1', 'SchedulerCanary', manifestName) { + metadata+: { + namespace: params.namespace, + }, + spec: { + interval: p.interval, + maxPodCompletionTimeout: p.maxPodCompletionTimeout, + forbidParallelRuns: true, + podTemplate: { + metadata: {}, + spec: { + affinity: { + nodeAffinity: params.canary_node_affinity, + }, + containers: [ + { + command: [ + 'sh', + '-c', + ], + args: [ + std.join(';\n', [ + 'set -euo pipefail', + 'f="/testmount/t-`date -Iseconds`"', + 'echo test > "$f"', + 'test `cat "$f"` = "test"', + 'rm -f "$f"', + ]), + ], + image: 'image-registry.openshift-image-registry.svc:5000/%s/%s:latest' % [ canaryImageStream.metadata.namespace, canaryImageStream.metadata.name ], + imagePullPolicy: 'Always', + name: 'storage', + resources: {}, + terminationMessagePath: '/dev/termination-log', + terminationMessagePolicy: 'File', + volumeMounts: [ + { + mountPath: '/testmount', + name: 'test', + }, + ], + }, + ], + volumes: [ + { + name: 'test', + persistentVolumeClaim: { + claimName: manifestName, + }, + }, + ], + restartPolicy: 'Never', + schedulerName: 'default-scheduler', + securityContext: {}, + terminationGracePeriodSeconds: 10, + }, + }, + }, + }, + ] + , + std.objectFields(params.slos.storage.canary._sli.volume_plugins) +)); + { '00_namespace': kube.Namespace(params.namespace) { metadata+: { @@ -112,8 +196,10 @@ local canary = kube._Object('monitoring.appuio.io/v1beta1', 'SchedulerCanary', ' }, }, }, + '10_secrets': com.generateResources(params.secrets, function(name) com.namespaced(params.namespace, kube.Secret(name))), [if params.canary_scheduler_controller.enabled then '30_canaryImageStream']: canaryImageStream, [if params.canary_scheduler_controller.enabled then '30_canary']: canary, + [if params.canary_scheduler_controller.enabled then '32_storageCanary']: storageCanaries, } + blackbox.deployment + blackbox.probes diff --git a/component/slos.libsonnet 
b/component/slos.libsonnet index 141a1bf..61dcc9a 100644 --- a/component/slos.libsonnet +++ b/component/slos.libsonnet @@ -30,8 +30,8 @@ local defaultSlos = { sli: { events: { local queryParams = { namespace: params.namespace }, - error_query: 'sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="%(namespace)s",reason="timed_out"}[{{.window}}]))' % queryParams, - total_query: 'sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="%(namespace)s"}[{{.window}}]))' % queryParams, + error_query: 'sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="%(namespace)s",reason!="completed"}[{{.window}}]))' % queryParams, + total_query: 'sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="%(namespace)s"}[{{.window}}]))' % queryParams, }, }, alerting: { @@ -52,43 +52,75 @@ local defaultSlos = { sloth_input: { version: 'prometheus/v1', service: 'storage', - _slos: { - 'csi-operations': { - description: 'SLO based on number of failed csi operations', - sli: { - events: { - // We use `or on() vector(0)` here to ensure we always have a - // value for the error query, even if there's 0 failing storage - // operations in a time window. This is necessary because the - // timeseries for status="fail-unknown" may not exist at all if - // there's no failures. - error_query: - 'sum(rate(storage_operation_duration_seconds_count{volume_plugin=~"%s",operation_name=~"%s",status="fail-unknown"}[{{.window}}])) or on() vector(0)' - % [ config['csi-operations']._sli.volume_plugin, config['csi-operations']._sli.operation_name ], - total_query: - // We use (sum() > 0) or on() vector(1)) to guard against time - // windows where we have 0 storage operations, which would - // otherwise result in a division by 0. We do this because, - // dividing by 0 results in the whole expression evaluating to - // NaN which breaks the SLO alert. - // Note that we can safely divide by 1, since there can't be - // any failed operations when there's no operations at all, so - // if the `vector(1)` is used, the expression will always - // reduce to 0/1. 
- '(sum(rate(storage_operation_duration_seconds_count{volume_plugin=~"%s",operation_name=~"%s"}[{{.window}}])) > 0) or on() vector(1)' % - [ config['csi-operations']._sli.volume_plugin, config['csi-operations']._sli.operation_name ], - }, + _slos: std.foldl( + function(prev, plugin) + local storageClassName = if plugin == '' then 'default' else plugin; + local canaryName = 'storage-canary-%s' % storageClassName; + prev { + [canaryName]: { + description: 'OpenShift workload schedulability SLO based on github.com/appuio/scheduler-canary-controller canary', + sli: { + events: { + local queryParams = { name: canaryName, namespace: params.namespace }, + error_query: 'sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="%(name)s",exported_namespace="%(namespace)s",reason!="completed"}[{{.window}}]))' % queryParams, + total_query: 'sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="%(name)s",exported_namespace="%(namespace)s"}[{{.window}}]))' % queryParams, + }, + }, + alerting: { + name: 'SLO_StorageCanaryWorkloadTimesOut', + annotations: { + summary: 'Storage canary workloads time out.', + }, + labels: { + storageclass: storageClassName, + }, + page_alert: {}, + ticket_alert: {}, + }, + } + config.canary, }, - alerting: { - name: 'SLO_StorageOperationHighErrorRate', - annotations: { - summary: 'High storage operation error rate', + std.filter( + function(plugin) config.canary._sli.volume_plugins[plugin] != null, + std.objectFields(config.canary._sli.volume_plugins) + ), + { + 'csi-operations': { + description: 'SLO based on number of failed csi operations', + sli: { + events: { + // We use `or on() vector(0)` here to ensure we always have a + // value for the error query, even if there's 0 failing storage + // operations in a time window. This is necessary because the + // timeseries for status="fail-unknown" may not exist at all if + // there's no failures. + error_query: + 'sum(rate(storage_operation_duration_seconds_count{volume_plugin=~"%s",operation_name=~"%s",status="fail-unknown"}[{{.window}}])) or on() vector(0)' + % [ config['csi-operations']._sli.volume_plugin, config['csi-operations']._sli.operation_name ], + total_query: + // We use (sum() > 0) or on() vector(1)) to guard against time + // windows where we have 0 storage operations, which would + // otherwise result in a division by 0. We do this because, + // dividing by 0 results in the whole expression evaluating to + // NaN which breaks the SLO alert. + // Note that we can safely divide by 1, since there can't be + // any failed operations when there's no operations at all, so + // if the `vector(1)` is used, the expression will always + // reduce to 0/1. 
+ '(sum(rate(storage_operation_duration_seconds_count{volume_plugin=~"%s",operation_name=~"%s"}[{{.window}}])) > 0) or on() vector(1)' % + [ config['csi-operations']._sli.volume_plugin, config['csi-operations']._sli.operation_name ], + }, }, - page_alert: {}, - ticket_alert: {}, - }, - } + config['csi-operations'], - }, + alerting: { + name: 'SLO_StorageOperationHighErrorRate', + annotations: { + summary: 'High storage operation error rate', + }, + page_alert: {}, + ticket_alert: {}, + }, + } + config['csi-operations'], + } + ), }, }, ingress: { diff --git a/docs/modules/ROOT/pages/references/parameters.adoc b/docs/modules/ROOT/pages/references/parameters.adoc index 15aa17f..479eb58 100644 --- a/docs/modules/ROOT/pages/references/parameters.adoc +++ b/docs/modules/ROOT/pages/references/parameters.adoc @@ -28,6 +28,28 @@ Sloth isn't actually deployed to the cluster, but used to render `PrometheusRule The entry in `images` allows Renovate to create version upgrade PRs. The Sloth version can be overridden by the `tag` parameter. + +== `secrets` + +[horizontal] +type:: dictionary +default:: `{}` +example:: + +[source,yaml] +---- +secrets: + canary-ssd-encrypted-luks-key: + stringData: + luksKey: XXXXXX +---- + +This parameter allows creating arbitrary `Secret` resources. + +The dictionary keys are used as `metadata.name` for the resulting `Secret` resources. +The secrets are created in the namespace indicated by parameter `namespace`. + + == `slos` [horizontal] @@ -51,7 +73,7 @@ csi-operations: operation_name: ".+" ---- -The configuration for the csi-operations SLO. +The configuration for the csi-operations storage SLO. The SLO can be disabled by setting `enabled` to false. @@ -62,6 +84,60 @@ Any additional field is added directly to the `slo` input for sloth. NOTE: Look at xref:runbooks/storage.adoc#csi-operations[the runbook] for an explanation of this SLO. +=== `slos.storage.canary` + +[horizontal] +type:: dictionary +default:: + +[source,yaml] +---- +canary: + enabled: true + objective: 99.0 + _sli: + volume_plugins_default_params: + size: 1Gi + accessMode: ReadWriteOnce + interval: 1m + maxPodCompletionTimeout: 3m + + volume_plugins: + # Empty value for the default plugin + "": {} +---- +example:: + +[source,yaml] +---- +canary: + enabled: true + objective: 99.0 + _sli: + volume_plugins: + # Disable the canary for the default storage class + "": null + # Enable the canaries for ssd and bulk storage classes + ssd: {} + bulk: + size: 10Gi +---- + +The configuration for the canary storage SLO. + +The SLO can be disabled by setting `enabled` to false. + +The canary SLI is measured by creating a PVC for every configured storage class and periodically running a pod that writes and deletes a file on the respective PVC. +You can configure which volume plugins are tested with `_sli.volume_plugins`. +The key is the storage class name and the value is a dictionary which can override the default parameters set in `_sli.volume_plugins_default_params`. +An empty key (`""`) is used for the default storage class. +The value can be set to `null` to disable the canary for a specific storage class. + +Any additional field is added directly to the `slo` input for sloth. + +NOTE: Look at xref:runbooks/storage.adoc#canaries[the runbook] for an explanation of this SLO.
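+
+For illustration, the `bulk` entry from the example above is merged with `volume_plugins_default_params` and rendered into a PVC named `storage-canary-bulk`, roughly like the following sketch (assuming the default component namespace `appuio-openshift4-slos`; a matching `SchedulerCanary` resource that mounts the claim is rendered alongside it):
+
+[source,yaml]
+----
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: storage-canary-bulk
+  namespace: appuio-openshift4-slos
+spec:
+  accessModes:
+    - ReadWriteOnce # accessMode from volume_plugins_default_params
+  resources:
+    requests:
+      storage: 10Gi # size overridden by the `bulk` entry
+  storageClassName: bulk
+----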
+
+
 === `slos.kubernetes_api.requests`
 
 [horizontal]
diff --git a/docs/modules/ROOT/pages/runbooks/storage.adoc b/docs/modules/ROOT/pages/runbooks/storage.adoc
index 5073c0d..98011c7 100644
--- a/docs/modules/ROOT/pages/runbooks/storage.adoc
+++ b/docs/modules/ROOT/pages/runbooks/storage.adoc
@@ -65,3 +65,91 @@ slos:
 ----
 
 include::partial$runbooks/objective_change_warning.adoc[]
+
+[[canaries]]
+== Canaries
+
+=== icon:glasses[] Overview
+
+This SLO measures the percentage of canary pods that timed out or failed during their complete lifecycle.
+Each canary pod mounts a PVC, writes a file to it, and then removes the file again.
+In the current implementation the image for the pod is pulled from the built-in OpenShift registry.
+
+include::partial$runbooks/alert_types.adoc[]
+
+=== icon:bug[] Steps for debugging
+
+An unschedulable storage canary workload can have root causes other than storage issues.
+
+First check the debugging section of the xref:runbooks/workload-schedulability.adoc#canary[Workload Schedulability - Canary] runbook.
+
+Check the output of the `kubectl describe` command documented in the runbook above for storage-specific issues:
+
+[source]
+----
+Events:
+  Type     Reason       Age   From     Message
+  ----     ------       ----  ----     -------
+  Warning  FailedMount  47s   kubelet  MountVolume.SetUp failed for volume "pvc-d0fd0b25-60dc-47dd-9ec8-734941361dff" : mount failed: exit status 32
+Mounting command: mount
+Mounting arguments: -t nfs nfs-store3.example.ch:/srv/nfs/export/canary-nfs-pvc-d0fd0b25-60dc-47dd-9ec8-734941361dff /var/lib/kubelet/pods/9dbc5afa-5af1-4176-bc06-8a428a5448a8/volumes/kubernetes.io~nfs/pvc-d0fd0b25-60dc-47dd-9ec8-734941361dff
+Output: mount.nfs: Connection refused
+
+Events:
+  Type     Reason       Age  From     Message
+  ----     ------       ---- ----     -------
+  Warning  FailedMount  23m  kubelet  MountVolume.SetUp failed for volume "pvc-7681612b-9ac4-4cd6-867c-14c990351078" : mount failed: No space left on device
+----
+
+If the pod goes into an `Error` state, you can check the logs of the pod with `kubectl logs`.
+
+[source,bash]
+----
+kubectl describe pods -A -l "scheduler-canary-controller.appuio.io/instance" | grep Error
+
+kubectl logs -n <namespace> <pod-name>
+----
+
+Check the PVC and PV status with `kubectl get pvc` and `kubectl get pv`, and check the logs of the CSI plugin for more information.
+
+[source,bash]
+----
+# Extract the pod name and PVC name
+kubectl get pods -A -l "scheduler-canary-controller.appuio.io/instance" -o=jsonpath="{range .items[*]}{.metadata.name}{'\t'}{.spec.volumes[*].persistentVolumeClaim.claimName}{'\n'}{end}"
+
+# Check the PVC status
+kubectl describe pvc -n <namespace> <pvc-name>
+
+# Extract the PV name from the PVC
+kubectl get pvc -n <namespace> <pvc-name> -o=jsonpath="{.spec.volumeName}"
+# Check the PV status
+kubectl describe pv <pv-name>
+----
+
+include::partial$runbooks/wip_note.adoc[]
+
+=== icon:wrench[] Tune
+
+If this alert isn't actionable, is noisy, or was raised too late, you may want to tune the SLO.
+
+You have the option to tune the SLO through the component parameters.
+You can modify the objective, disable the page or ticket alert, restrict the SLO to certain storage classes, or completely disable the SLO.
+
+The example below sets the objective to 99%, disables the canary for the default storage class, and only enables the canary for the Syn-managed CephFS storage class.
+This means this SLO won't alert if there are issues with storage classes other than the Syn-managed CephFS storage class.
+ +[source,yaml] +---- +slos: + storage: + canary: + objective: 99.0 + _sli: + volume_plugins: + # Disable default storage class canary + "": null + # Enable Syn-managed CephFS canary + cephfs-fspool-cluster: {} +---- + +include::partial$runbooks/objective_change_warning.adoc[] diff --git a/docs/modules/ROOT/pages/runbooks/workload-schedulability.adoc b/docs/modules/ROOT/pages/runbooks/workload-schedulability.adoc index 0a0340a..6b647a2 100644 --- a/docs/modules/ROOT/pages/runbooks/workload-schedulability.adoc +++ b/docs/modules/ROOT/pages/runbooks/workload-schedulability.adoc @@ -7,7 +7,7 @@ include::partial$runbooks/contribution_note.adoc[] === icon:glasses[] Overview -This SLO measures the percentage of pods timed out during a complete lifecycle, measured while waiting for a canary pods. +This SLO measures the percentage of pods timed out or failed during a complete lifecycle, measured while waiting for a canary pods. In the current implementation the image for the pod is pulled from the built-in OpenShift registry. The error rate is a general indicator of cluster and workload health. diff --git a/tests/defaults.yml b/tests/defaults.yml index a68a6ef..86cfc17 100644 --- a/tests/defaults.yml +++ b/tests/defaults.yml @@ -84,3 +84,18 @@ parameters: staticConfig: static: - https://www.appuio.ch/ + + secrets: + canary-ssd-encrypted-luks-key: + stringData: + luksKey: XXXXXX + + slos: + storage: + canary: + _sli: + volume_plugins: + "bulk": {} + "blub": null + "cephfs-fspool-cluster": + accessMode: ReadWriteMany diff --git a/tests/golden/defaults/openshift4-slos/openshift4-slos/10_secrets.yaml b/tests/golden/defaults/openshift4-slos/openshift4-slos/10_secrets.yaml new file mode 100644 index 0000000..6546d7a --- /dev/null +++ b/tests/golden/defaults/openshift4-slos/openshift4-slos/10_secrets.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +data: {} +kind: Secret +metadata: + annotations: {} + labels: + name: canary-ssd-encrypted-luks-key + name: canary-ssd-encrypted-luks-key + namespace: appuio-openshift4-slos +stringData: + luksKey: XXXXXX +type: Opaque diff --git a/tests/golden/defaults/openshift4-slos/openshift4-slos/32_storageCanary.yaml b/tests/golden/defaults/openshift4-slos/openshift4-slos/32_storageCanary.yaml new file mode 100644 index 0000000..ed6be70 --- /dev/null +++ b/tests/golden/defaults/openshift4-slos/openshift4-slos/32_storageCanary.yaml @@ -0,0 +1,199 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + annotations: {} + labels: + name: storage-canary-default + name: storage-canary-default + namespace: appuio-openshift4-slos +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi +--- +apiVersion: monitoring.appuio.io/v1beta1 +kind: SchedulerCanary +metadata: + annotations: {} + labels: + name: storage-canary-default + name: storage-canary-default + namespace: appuio-openshift4-slos +spec: + forbidParallelRuns: true + interval: 1m + maxPodCompletionTimeout: 3m + podTemplate: + metadata: {} + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/app + operator: Exists + containers: + - args: + - |- + set -euo pipefail; + f="/testmount/t-`date -Iseconds`"; + echo test > "$f"; + test `cat "$f"` = "test"; + rm -f "$f" + command: + - sh + - -c + image: image-registry.openshift-image-registry.svc:5000/appuio-openshift4-slos/canary:latest + imagePullPolicy: Always + name: storage + resources: {} + terminationMessagePath: /dev/termination-log + 
terminationMessagePolicy: File + volumeMounts: + - mountPath: /testmount + name: test + restartPolicy: Never + schedulerName: default-scheduler + securityContext: {} + terminationGracePeriodSeconds: 10 + volumes: + - name: test + persistentVolumeClaim: + claimName: storage-canary-default +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + annotations: {} + labels: + name: storage-canary-bulk + name: storage-canary-bulk + namespace: appuio-openshift4-slos +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi + storageClassName: bulk +--- +apiVersion: monitoring.appuio.io/v1beta1 +kind: SchedulerCanary +metadata: + annotations: {} + labels: + name: storage-canary-bulk + name: storage-canary-bulk + namespace: appuio-openshift4-slos +spec: + forbidParallelRuns: true + interval: 1m + maxPodCompletionTimeout: 3m + podTemplate: + metadata: {} + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/app + operator: Exists + containers: + - args: + - |- + set -euo pipefail; + f="/testmount/t-`date -Iseconds`"; + echo test > "$f"; + test `cat "$f"` = "test"; + rm -f "$f" + command: + - sh + - -c + image: image-registry.openshift-image-registry.svc:5000/appuio-openshift4-slos/canary:latest + imagePullPolicy: Always + name: storage + resources: {} + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /testmount + name: test + restartPolicy: Never + schedulerName: default-scheduler + securityContext: {} + terminationGracePeriodSeconds: 10 + volumes: + - name: test + persistentVolumeClaim: + claimName: storage-canary-bulk +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + annotations: {} + labels: + name: storage-canary-cephfs-fspool-cluster + name: storage-canary-cephfs-fspool-cluster + namespace: appuio-openshift4-slos +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 1Gi + storageClassName: cephfs-fspool-cluster +--- +apiVersion: monitoring.appuio.io/v1beta1 +kind: SchedulerCanary +metadata: + annotations: {} + labels: + name: storage-canary-cephfs-fspool-cluster + name: storage-canary-cephfs-fspool-cluster + namespace: appuio-openshift4-slos +spec: + forbidParallelRuns: true + interval: 1m + maxPodCompletionTimeout: 3m + podTemplate: + metadata: {} + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/app + operator: Exists + containers: + - args: + - |- + set -euo pipefail; + f="/testmount/t-`date -Iseconds`"; + echo test > "$f"; + test `cat "$f"` = "test"; + rm -f "$f" + command: + - sh + - -c + image: image-registry.openshift-image-registry.svc:5000/appuio-openshift4-slos/canary:latest + imagePullPolicy: Always + name: storage + resources: {} + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /testmount + name: test + restartPolicy: Never + schedulerName: default-scheduler + securityContext: {} + terminationGracePeriodSeconds: 10 + volumes: + - name: test + persistentVolumeClaim: + claimName: storage-canary-cephfs-fspool-cluster diff --git a/tests/golden/defaults/openshift4-slos/openshift4-slos/storage.yaml b/tests/golden/defaults/openshift4-slos/openshift4-slos/storage.yaml index 95b5ad5..32990eb 100644 --- a/tests/golden/defaults/openshift4-slos/openshift4-slos/storage.yaml +++ 
b/tests/golden/defaults/openshift4-slos/openshift4-slos/storage.yaml @@ -195,3 +195,570 @@ spec: sloth_severity: ticket syn: 'true' syn_component: openshift4-slos + - name: sloth-slo-sli-recordings-storage-storage-canary-bulk + rules: + - expr: | + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-bulk",exported_namespace="appuio-openshift4-slos",reason!="completed"}[5m]))) + / + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-bulk",exported_namespace="appuio-openshift4-slos"}[5m]))) + labels: + sloth_id: storage-storage-canary-bulk + sloth_service: storage + sloth_slo: storage-canary-bulk + sloth_window: 5m + record: slo:sli_error:ratio_rate5m + - expr: | + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-bulk",exported_namespace="appuio-openshift4-slos",reason!="completed"}[30m]))) + / + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-bulk",exported_namespace="appuio-openshift4-slos"}[30m]))) + labels: + sloth_id: storage-storage-canary-bulk + sloth_service: storage + sloth_slo: storage-canary-bulk + sloth_window: 30m + record: slo:sli_error:ratio_rate30m + - expr: | + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-bulk",exported_namespace="appuio-openshift4-slos",reason!="completed"}[1h]))) + / + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-bulk",exported_namespace="appuio-openshift4-slos"}[1h]))) + labels: + sloth_id: storage-storage-canary-bulk + sloth_service: storage + sloth_slo: storage-canary-bulk + sloth_window: 1h + record: slo:sli_error:ratio_rate1h + - expr: | + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-bulk",exported_namespace="appuio-openshift4-slos",reason!="completed"}[2h]))) + / + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-bulk",exported_namespace="appuio-openshift4-slos"}[2h]))) + labels: + sloth_id: storage-storage-canary-bulk + sloth_service: storage + sloth_slo: storage-canary-bulk + sloth_window: 2h + record: slo:sli_error:ratio_rate2h + - expr: | + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-bulk",exported_namespace="appuio-openshift4-slos",reason!="completed"}[6h]))) + / + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-bulk",exported_namespace="appuio-openshift4-slos"}[6h]))) + labels: + sloth_id: storage-storage-canary-bulk + sloth_service: storage + sloth_slo: storage-canary-bulk + sloth_window: 6h + record: slo:sli_error:ratio_rate6h + - expr: | + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-bulk",exported_namespace="appuio-openshift4-slos",reason!="completed"}[1d]))) + / + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-bulk",exported_namespace="appuio-openshift4-slos"}[1d]))) + labels: + sloth_id: storage-storage-canary-bulk + sloth_service: storage + sloth_slo: storage-canary-bulk + sloth_window: 1d + record: slo:sli_error:ratio_rate1d + - expr: | + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-bulk",exported_namespace="appuio-openshift4-slos",reason!="completed"}[3d]))) + / + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-bulk",exported_namespace="appuio-openshift4-slos"}[3d]))) + labels: + sloth_id: storage-storage-canary-bulk + sloth_service: storage + sloth_slo: 
storage-canary-bulk + sloth_window: 3d + record: slo:sli_error:ratio_rate3d + - expr: | + sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="storage-storage-canary-bulk", sloth_service="storage", sloth_slo="storage-canary-bulk"}[30d]) + / ignoring (sloth_window) + count_over_time(slo:sli_error:ratio_rate5m{sloth_id="storage-storage-canary-bulk", sloth_service="storage", sloth_slo="storage-canary-bulk"}[30d]) + labels: + sloth_id: storage-storage-canary-bulk + sloth_service: storage + sloth_slo: storage-canary-bulk + sloth_window: 30d + record: slo:sli_error:ratio_rate30d + - name: sloth-slo-meta-recordings-storage-storage-canary-bulk + rules: + - expr: vector(0.99) + labels: + sloth_id: storage-storage-canary-bulk + sloth_service: storage + sloth_slo: storage-canary-bulk + record: slo:objective:ratio + - expr: vector(1-0.99) + labels: + sloth_id: storage-storage-canary-bulk + sloth_service: storage + sloth_slo: storage-canary-bulk + record: slo:error_budget:ratio + - expr: vector(30) + labels: + sloth_id: storage-storage-canary-bulk + sloth_service: storage + sloth_slo: storage-canary-bulk + record: slo:time_period:days + - expr: | + slo:sli_error:ratio_rate5m{sloth_id="storage-storage-canary-bulk", sloth_service="storage", sloth_slo="storage-canary-bulk"} + / on(sloth_id, sloth_slo, sloth_service) group_left + slo:error_budget:ratio{sloth_id="storage-storage-canary-bulk", sloth_service="storage", sloth_slo="storage-canary-bulk"} + labels: + sloth_id: storage-storage-canary-bulk + sloth_service: storage + sloth_slo: storage-canary-bulk + record: slo:current_burn_rate:ratio + - expr: | + slo:sli_error:ratio_rate30d{sloth_id="storage-storage-canary-bulk", sloth_service="storage", sloth_slo="storage-canary-bulk"} + / on(sloth_id, sloth_slo, sloth_service) group_left + slo:error_budget:ratio{sloth_id="storage-storage-canary-bulk", sloth_service="storage", sloth_slo="storage-canary-bulk"} + labels: + sloth_id: storage-storage-canary-bulk + sloth_service: storage + sloth_slo: storage-canary-bulk + record: slo:period_burn_rate:ratio + - expr: 1 - slo:period_burn_rate:ratio{sloth_id="storage-storage-canary-bulk", + sloth_service="storage", sloth_slo="storage-canary-bulk"} + labels: + sloth_id: storage-storage-canary-bulk + sloth_service: storage + sloth_slo: storage-canary-bulk + record: slo:period_error_budget_remaining:ratio + - expr: vector(1) + labels: + sloth_id: storage-storage-canary-bulk + sloth_mode: cli-gen-prom + sloth_objective: '99' + sloth_service: storage + sloth_slo: storage-canary-bulk + sloth_spec: prometheus/v1 + sloth_version: v0.11.0 + record: sloth_slo_info + - name: sloth-slo-alerts-storage-storage-canary-bulk + rules: + - alert: SLO_StorageCanaryWorkloadTimesOut + annotations: + runbook_url: https://hub.syn.tools/openshift4-slos/runbooks/storage.html#storage-canary-bulk + summary: Storage canary workloads time out. + title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error + budget burn rate is too fast. 
+ expr: | + ( + max(slo:sli_error:ratio_rate5m{sloth_id="storage-storage-canary-bulk", sloth_service="storage", sloth_slo="storage-canary-bulk"} > (14.4 * 0.01)) without (sloth_window) + and + max(slo:sli_error:ratio_rate1h{sloth_id="storage-storage-canary-bulk", sloth_service="storage", sloth_slo="storage-canary-bulk"} > (14.4 * 0.01)) without (sloth_window) + ) + or + ( + max(slo:sli_error:ratio_rate30m{sloth_id="storage-storage-canary-bulk", sloth_service="storage", sloth_slo="storage-canary-bulk"} > (6 * 0.01)) without (sloth_window) + and + max(slo:sli_error:ratio_rate6h{sloth_id="storage-storage-canary-bulk", sloth_service="storage", sloth_slo="storage-canary-bulk"} > (6 * 0.01)) without (sloth_window) + ) + labels: + severity: critical + slo: 'true' + sloth_severity: page + storageclass: bulk + syn: 'true' + syn_component: openshift4-slos + - alert: SLO_StorageCanaryWorkloadTimesOut + annotations: + runbook_url: https://hub.syn.tools/openshift4-slos/runbooks/storage.html#storage-canary-bulk + summary: Storage canary workloads time out. + title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error + budget burn rate is too fast. + expr: | + ( + max(slo:sli_error:ratio_rate2h{sloth_id="storage-storage-canary-bulk", sloth_service="storage", sloth_slo="storage-canary-bulk"} > (3 * 0.01)) without (sloth_window) + and + max(slo:sli_error:ratio_rate1d{sloth_id="storage-storage-canary-bulk", sloth_service="storage", sloth_slo="storage-canary-bulk"} > (3 * 0.01)) without (sloth_window) + ) + or + ( + max(slo:sli_error:ratio_rate6h{sloth_id="storage-storage-canary-bulk", sloth_service="storage", sloth_slo="storage-canary-bulk"} > (1 * 0.01)) without (sloth_window) + and + max(slo:sli_error:ratio_rate3d{sloth_id="storage-storage-canary-bulk", sloth_service="storage", sloth_slo="storage-canary-bulk"} > (1 * 0.01)) without (sloth_window) + ) + labels: + severity: warning + slo: 'true' + sloth_severity: ticket + storageclass: bulk + syn: 'true' + syn_component: openshift4-slos + - name: sloth-slo-sli-recordings-storage-storage-canary-cephfs-fspool-cluster + rules: + - expr: | + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-cephfs-fspool-cluster",exported_namespace="appuio-openshift4-slos",reason!="completed"}[5m]))) + / + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-cephfs-fspool-cluster",exported_namespace="appuio-openshift4-slos"}[5m]))) + labels: + sloth_id: storage-storage-canary-cephfs-fspool-cluster + sloth_service: storage + sloth_slo: storage-canary-cephfs-fspool-cluster + sloth_window: 5m + record: slo:sli_error:ratio_rate5m + - expr: | + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-cephfs-fspool-cluster",exported_namespace="appuio-openshift4-slos",reason!="completed"}[30m]))) + / + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-cephfs-fspool-cluster",exported_namespace="appuio-openshift4-slos"}[30m]))) + labels: + sloth_id: storage-storage-canary-cephfs-fspool-cluster + sloth_service: storage + sloth_slo: storage-canary-cephfs-fspool-cluster + sloth_window: 30m + record: slo:sli_error:ratio_rate30m + - expr: | + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-cephfs-fspool-cluster",exported_namespace="appuio-openshift4-slos",reason!="completed"}[1h]))) + / + 
(sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-cephfs-fspool-cluster",exported_namespace="appuio-openshift4-slos"}[1h]))) + labels: + sloth_id: storage-storage-canary-cephfs-fspool-cluster + sloth_service: storage + sloth_slo: storage-canary-cephfs-fspool-cluster + sloth_window: 1h + record: slo:sli_error:ratio_rate1h + - expr: | + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-cephfs-fspool-cluster",exported_namespace="appuio-openshift4-slos",reason!="completed"}[2h]))) + / + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-cephfs-fspool-cluster",exported_namespace="appuio-openshift4-slos"}[2h]))) + labels: + sloth_id: storage-storage-canary-cephfs-fspool-cluster + sloth_service: storage + sloth_slo: storage-canary-cephfs-fspool-cluster + sloth_window: 2h + record: slo:sli_error:ratio_rate2h + - expr: | + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-cephfs-fspool-cluster",exported_namespace="appuio-openshift4-slos",reason!="completed"}[6h]))) + / + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-cephfs-fspool-cluster",exported_namespace="appuio-openshift4-slos"}[6h]))) + labels: + sloth_id: storage-storage-canary-cephfs-fspool-cluster + sloth_service: storage + sloth_slo: storage-canary-cephfs-fspool-cluster + sloth_window: 6h + record: slo:sli_error:ratio_rate6h + - expr: | + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-cephfs-fspool-cluster",exported_namespace="appuio-openshift4-slos",reason!="completed"}[1d]))) + / + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-cephfs-fspool-cluster",exported_namespace="appuio-openshift4-slos"}[1d]))) + labels: + sloth_id: storage-storage-canary-cephfs-fspool-cluster + sloth_service: storage + sloth_slo: storage-canary-cephfs-fspool-cluster + sloth_window: 1d + record: slo:sli_error:ratio_rate1d + - expr: | + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-cephfs-fspool-cluster",exported_namespace="appuio-openshift4-slos",reason!="completed"}[3d]))) + / + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-cephfs-fspool-cluster",exported_namespace="appuio-openshift4-slos"}[3d]))) + labels: + sloth_id: storage-storage-canary-cephfs-fspool-cluster + sloth_service: storage + sloth_slo: storage-canary-cephfs-fspool-cluster + sloth_window: 3d + record: slo:sli_error:ratio_rate3d + - expr: | + sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="storage-storage-canary-cephfs-fspool-cluster", sloth_service="storage", sloth_slo="storage-canary-cephfs-fspool-cluster"}[30d]) + / ignoring (sloth_window) + count_over_time(slo:sli_error:ratio_rate5m{sloth_id="storage-storage-canary-cephfs-fspool-cluster", sloth_service="storage", sloth_slo="storage-canary-cephfs-fspool-cluster"}[30d]) + labels: + sloth_id: storage-storage-canary-cephfs-fspool-cluster + sloth_service: storage + sloth_slo: storage-canary-cephfs-fspool-cluster + sloth_window: 30d + record: slo:sli_error:ratio_rate30d + - name: sloth-slo-meta-recordings-storage-storage-canary-cephfs-fspool-cluster + rules: + - expr: vector(0.99) + labels: + sloth_id: storage-storage-canary-cephfs-fspool-cluster + sloth_service: storage + sloth_slo: storage-canary-cephfs-fspool-cluster + record: slo:objective:ratio + - expr: vector(1-0.99) + labels: + sloth_id: storage-storage-canary-cephfs-fspool-cluster + 
sloth_service: storage + sloth_slo: storage-canary-cephfs-fspool-cluster + record: slo:error_budget:ratio + - expr: vector(30) + labels: + sloth_id: storage-storage-canary-cephfs-fspool-cluster + sloth_service: storage + sloth_slo: storage-canary-cephfs-fspool-cluster + record: slo:time_period:days + - expr: | + slo:sli_error:ratio_rate5m{sloth_id="storage-storage-canary-cephfs-fspool-cluster", sloth_service="storage", sloth_slo="storage-canary-cephfs-fspool-cluster"} + / on(sloth_id, sloth_slo, sloth_service) group_left + slo:error_budget:ratio{sloth_id="storage-storage-canary-cephfs-fspool-cluster", sloth_service="storage", sloth_slo="storage-canary-cephfs-fspool-cluster"} + labels: + sloth_id: storage-storage-canary-cephfs-fspool-cluster + sloth_service: storage + sloth_slo: storage-canary-cephfs-fspool-cluster + record: slo:current_burn_rate:ratio + - expr: | + slo:sli_error:ratio_rate30d{sloth_id="storage-storage-canary-cephfs-fspool-cluster", sloth_service="storage", sloth_slo="storage-canary-cephfs-fspool-cluster"} + / on(sloth_id, sloth_slo, sloth_service) group_left + slo:error_budget:ratio{sloth_id="storage-storage-canary-cephfs-fspool-cluster", sloth_service="storage", sloth_slo="storage-canary-cephfs-fspool-cluster"} + labels: + sloth_id: storage-storage-canary-cephfs-fspool-cluster + sloth_service: storage + sloth_slo: storage-canary-cephfs-fspool-cluster + record: slo:period_burn_rate:ratio + - expr: 1 - slo:period_burn_rate:ratio{sloth_id="storage-storage-canary-cephfs-fspool-cluster", + sloth_service="storage", sloth_slo="storage-canary-cephfs-fspool-cluster"} + labels: + sloth_id: storage-storage-canary-cephfs-fspool-cluster + sloth_service: storage + sloth_slo: storage-canary-cephfs-fspool-cluster + record: slo:period_error_budget_remaining:ratio + - expr: vector(1) + labels: + sloth_id: storage-storage-canary-cephfs-fspool-cluster + sloth_mode: cli-gen-prom + sloth_objective: '99' + sloth_service: storage + sloth_slo: storage-canary-cephfs-fspool-cluster + sloth_spec: prometheus/v1 + sloth_version: v0.11.0 + record: sloth_slo_info + - name: sloth-slo-alerts-storage-storage-canary-cephfs-fspool-cluster + rules: + - alert: SLO_StorageCanaryWorkloadTimesOut + annotations: + runbook_url: https://hub.syn.tools/openshift4-slos/runbooks/storage.html#storage-canary-cephfs-fspool-cluster + summary: Storage canary workloads time out. + title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error + budget burn rate is too fast. 
+ expr: | + ( + max(slo:sli_error:ratio_rate5m{sloth_id="storage-storage-canary-cephfs-fspool-cluster", sloth_service="storage", sloth_slo="storage-canary-cephfs-fspool-cluster"} > (14.4 * 0.01)) without (sloth_window) + and + max(slo:sli_error:ratio_rate1h{sloth_id="storage-storage-canary-cephfs-fspool-cluster", sloth_service="storage", sloth_slo="storage-canary-cephfs-fspool-cluster"} > (14.4 * 0.01)) without (sloth_window) + ) + or + ( + max(slo:sli_error:ratio_rate30m{sloth_id="storage-storage-canary-cephfs-fspool-cluster", sloth_service="storage", sloth_slo="storage-canary-cephfs-fspool-cluster"} > (6 * 0.01)) without (sloth_window) + and + max(slo:sli_error:ratio_rate6h{sloth_id="storage-storage-canary-cephfs-fspool-cluster", sloth_service="storage", sloth_slo="storage-canary-cephfs-fspool-cluster"} > (6 * 0.01)) without (sloth_window) + ) + labels: + severity: critical + slo: 'true' + sloth_severity: page + storageclass: cephfs-fspool-cluster + syn: 'true' + syn_component: openshift4-slos + - alert: SLO_StorageCanaryWorkloadTimesOut + annotations: + runbook_url: https://hub.syn.tools/openshift4-slos/runbooks/storage.html#storage-canary-cephfs-fspool-cluster + summary: Storage canary workloads time out. + title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error + budget burn rate is too fast. + expr: | + ( + max(slo:sli_error:ratio_rate2h{sloth_id="storage-storage-canary-cephfs-fspool-cluster", sloth_service="storage", sloth_slo="storage-canary-cephfs-fspool-cluster"} > (3 * 0.01)) without (sloth_window) + and + max(slo:sli_error:ratio_rate1d{sloth_id="storage-storage-canary-cephfs-fspool-cluster", sloth_service="storage", sloth_slo="storage-canary-cephfs-fspool-cluster"} > (3 * 0.01)) without (sloth_window) + ) + or + ( + max(slo:sli_error:ratio_rate6h{sloth_id="storage-storage-canary-cephfs-fspool-cluster", sloth_service="storage", sloth_slo="storage-canary-cephfs-fspool-cluster"} > (1 * 0.01)) without (sloth_window) + and + max(slo:sli_error:ratio_rate3d{sloth_id="storage-storage-canary-cephfs-fspool-cluster", sloth_service="storage", sloth_slo="storage-canary-cephfs-fspool-cluster"} > (1 * 0.01)) without (sloth_window) + ) + labels: + severity: warning + slo: 'true' + sloth_severity: ticket + storageclass: cephfs-fspool-cluster + syn: 'true' + syn_component: openshift4-slos + - name: sloth-slo-sli-recordings-storage-storage-canary-default + rules: + - expr: | + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-default",exported_namespace="appuio-openshift4-slos",reason!="completed"}[5m]))) + / + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-default",exported_namespace="appuio-openshift4-slos"}[5m]))) + labels: + sloth_id: storage-storage-canary-default + sloth_service: storage + sloth_slo: storage-canary-default + sloth_window: 5m + record: slo:sli_error:ratio_rate5m + - expr: | + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-default",exported_namespace="appuio-openshift4-slos",reason!="completed"}[30m]))) + / + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-default",exported_namespace="appuio-openshift4-slos"}[30m]))) + labels: + sloth_id: storage-storage-canary-default + sloth_service: storage + sloth_slo: storage-canary-default + sloth_window: 30m + record: slo:sli_error:ratio_rate30m + - expr: | + 
(sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-default",exported_namespace="appuio-openshift4-slos",reason!="completed"}[1h]))) + / + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-default",exported_namespace="appuio-openshift4-slos"}[1h]))) + labels: + sloth_id: storage-storage-canary-default + sloth_service: storage + sloth_slo: storage-canary-default + sloth_window: 1h + record: slo:sli_error:ratio_rate1h + - expr: | + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-default",exported_namespace="appuio-openshift4-slos",reason!="completed"}[2h]))) + / + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-default",exported_namespace="appuio-openshift4-slos"}[2h]))) + labels: + sloth_id: storage-storage-canary-default + sloth_service: storage + sloth_slo: storage-canary-default + sloth_window: 2h + record: slo:sli_error:ratio_rate2h + - expr: | + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-default",exported_namespace="appuio-openshift4-slos",reason!="completed"}[6h]))) + / + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-default",exported_namespace="appuio-openshift4-slos"}[6h]))) + labels: + sloth_id: storage-storage-canary-default + sloth_service: storage + sloth_slo: storage-canary-default + sloth_window: 6h + record: slo:sli_error:ratio_rate6h + - expr: | + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-default",exported_namespace="appuio-openshift4-slos",reason!="completed"}[1d]))) + / + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-default",exported_namespace="appuio-openshift4-slos"}[1d]))) + labels: + sloth_id: storage-storage-canary-default + sloth_service: storage + sloth_slo: storage-canary-default + sloth_window: 1d + record: slo:sli_error:ratio_rate1d + - expr: | + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-default",exported_namespace="appuio-openshift4-slos",reason!="completed"}[3d]))) + / + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="storage-canary-default",exported_namespace="appuio-openshift4-slos"}[3d]))) + labels: + sloth_id: storage-storage-canary-default + sloth_service: storage + sloth_slo: storage-canary-default + sloth_window: 3d + record: slo:sli_error:ratio_rate3d + - expr: | + sum_over_time(slo:sli_error:ratio_rate5m{sloth_id="storage-storage-canary-default", sloth_service="storage", sloth_slo="storage-canary-default"}[30d]) + / ignoring (sloth_window) + count_over_time(slo:sli_error:ratio_rate5m{sloth_id="storage-storage-canary-default", sloth_service="storage", sloth_slo="storage-canary-default"}[30d]) + labels: + sloth_id: storage-storage-canary-default + sloth_service: storage + sloth_slo: storage-canary-default + sloth_window: 30d + record: slo:sli_error:ratio_rate30d + - name: sloth-slo-meta-recordings-storage-storage-canary-default + rules: + - expr: vector(0.99) + labels: + sloth_id: storage-storage-canary-default + sloth_service: storage + sloth_slo: storage-canary-default + record: slo:objective:ratio + - expr: vector(1-0.99) + labels: + sloth_id: storage-storage-canary-default + sloth_service: storage + sloth_slo: storage-canary-default + record: slo:error_budget:ratio + - expr: vector(30) + labels: + sloth_id: storage-storage-canary-default + sloth_service: storage + sloth_slo: storage-canary-default + record: 
slo:time_period:days + - expr: | + slo:sli_error:ratio_rate5m{sloth_id="storage-storage-canary-default", sloth_service="storage", sloth_slo="storage-canary-default"} + / on(sloth_id, sloth_slo, sloth_service) group_left + slo:error_budget:ratio{sloth_id="storage-storage-canary-default", sloth_service="storage", sloth_slo="storage-canary-default"} + labels: + sloth_id: storage-storage-canary-default + sloth_service: storage + sloth_slo: storage-canary-default + record: slo:current_burn_rate:ratio + - expr: | + slo:sli_error:ratio_rate30d{sloth_id="storage-storage-canary-default", sloth_service="storage", sloth_slo="storage-canary-default"} + / on(sloth_id, sloth_slo, sloth_service) group_left + slo:error_budget:ratio{sloth_id="storage-storage-canary-default", sloth_service="storage", sloth_slo="storage-canary-default"} + labels: + sloth_id: storage-storage-canary-default + sloth_service: storage + sloth_slo: storage-canary-default + record: slo:period_burn_rate:ratio + - expr: 1 - slo:period_burn_rate:ratio{sloth_id="storage-storage-canary-default", + sloth_service="storage", sloth_slo="storage-canary-default"} + labels: + sloth_id: storage-storage-canary-default + sloth_service: storage + sloth_slo: storage-canary-default + record: slo:period_error_budget_remaining:ratio + - expr: vector(1) + labels: + sloth_id: storage-storage-canary-default + sloth_mode: cli-gen-prom + sloth_objective: '99' + sloth_service: storage + sloth_slo: storage-canary-default + sloth_spec: prometheus/v1 + sloth_version: v0.11.0 + record: sloth_slo_info + - name: sloth-slo-alerts-storage-storage-canary-default + rules: + - alert: SLO_StorageCanaryWorkloadTimesOut + annotations: + runbook_url: https://hub.syn.tools/openshift4-slos/runbooks/storage.html#storage-canary-default + summary: Storage canary workloads time out. + title: (page) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error + budget burn rate is too fast. + expr: | + ( + max(slo:sli_error:ratio_rate5m{sloth_id="storage-storage-canary-default", sloth_service="storage", sloth_slo="storage-canary-default"} > (14.4 * 0.01)) without (sloth_window) + and + max(slo:sli_error:ratio_rate1h{sloth_id="storage-storage-canary-default", sloth_service="storage", sloth_slo="storage-canary-default"} > (14.4 * 0.01)) without (sloth_window) + ) + or + ( + max(slo:sli_error:ratio_rate30m{sloth_id="storage-storage-canary-default", sloth_service="storage", sloth_slo="storage-canary-default"} > (6 * 0.01)) without (sloth_window) + and + max(slo:sli_error:ratio_rate6h{sloth_id="storage-storage-canary-default", sloth_service="storage", sloth_slo="storage-canary-default"} > (6 * 0.01)) without (sloth_window) + ) + labels: + severity: critical + slo: 'true' + sloth_severity: page + storageclass: default + syn: 'true' + syn_component: openshift4-slos + - alert: SLO_StorageCanaryWorkloadTimesOut + annotations: + runbook_url: https://hub.syn.tools/openshift4-slos/runbooks/storage.html#storage-canary-default + summary: Storage canary workloads time out. + title: (ticket) {{$labels.sloth_service}} {{$labels.sloth_slo}} SLO error + budget burn rate is too fast. 
+ expr: | + ( + max(slo:sli_error:ratio_rate2h{sloth_id="storage-storage-canary-default", sloth_service="storage", sloth_slo="storage-canary-default"} > (3 * 0.01)) without (sloth_window) + and + max(slo:sli_error:ratio_rate1d{sloth_id="storage-storage-canary-default", sloth_service="storage", sloth_slo="storage-canary-default"} > (3 * 0.01)) without (sloth_window) + ) + or + ( + max(slo:sli_error:ratio_rate6h{sloth_id="storage-storage-canary-default", sloth_service="storage", sloth_slo="storage-canary-default"} > (1 * 0.01)) without (sloth_window) + and + max(slo:sli_error:ratio_rate3d{sloth_id="storage-storage-canary-default", sloth_service="storage", sloth_slo="storage-canary-default"} > (1 * 0.01)) without (sloth_window) + ) + labels: + severity: warning + slo: 'true' + sloth_severity: ticket + storageclass: default + syn: 'true' + syn_component: openshift4-slos diff --git a/tests/golden/defaults/openshift4-slos/openshift4-slos/workload-schedulability.yaml b/tests/golden/defaults/openshift4-slos/openshift4-slos/workload-schedulability.yaml index 5fd4e9d..202597f 100644 --- a/tests/golden/defaults/openshift4-slos/openshift4-slos/workload-schedulability.yaml +++ b/tests/golden/defaults/openshift4-slos/openshift4-slos/workload-schedulability.yaml @@ -11,9 +11,9 @@ spec: - name: sloth-slo-sli-recordings-workload-schedulability-canary rules: - expr: | - (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos",reason="timed_out"}[5m]))) + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos",reason!="completed"}[5m]))) / - (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos"}[5m]))) + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos"}[5m]))) labels: sloth_id: workload-schedulability-canary sloth_service: workload-schedulability @@ -21,9 +21,9 @@ spec: sloth_window: 5m record: slo:sli_error:ratio_rate5m - expr: | - (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos",reason="timed_out"}[30m]))) + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos",reason!="completed"}[30m]))) / - (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos"}[30m]))) + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos"}[30m]))) labels: sloth_id: workload-schedulability-canary sloth_service: workload-schedulability @@ -31,9 +31,9 @@ spec: sloth_window: 30m record: slo:sli_error:ratio_rate30m - expr: | - (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos",reason="timed_out"}[1h]))) + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos",reason!="completed"}[1h]))) / - (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos"}[1h]))) + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos"}[1h]))) labels: sloth_id: workload-schedulability-canary sloth_service: workload-schedulability @@ -41,9 +41,9 @@ spec: sloth_window: 1h record: slo:sli_error:ratio_rate1h - expr: | 
- (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos",reason="timed_out"}[2h]))) + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos",reason!="completed"}[2h]))) / - (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos"}[2h]))) + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos"}[2h]))) labels: sloth_id: workload-schedulability-canary sloth_service: workload-schedulability @@ -51,9 +51,9 @@ spec: sloth_window: 2h record: slo:sli_error:ratio_rate2h - expr: | - (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos",reason="timed_out"}[6h]))) + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos",reason!="completed"}[6h]))) / - (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos"}[6h]))) + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos"}[6h]))) labels: sloth_id: workload-schedulability-canary sloth_service: workload-schedulability @@ -61,9 +61,9 @@ spec: sloth_window: 6h record: slo:sli_error:ratio_rate6h - expr: | - (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos",reason="timed_out"}[1d]))) + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos",reason!="completed"}[1d]))) / - (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos"}[1d]))) + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos"}[1d]))) labels: sloth_id: workload-schedulability-canary sloth_service: workload-schedulability @@ -71,9 +71,9 @@ spec: sloth_window: 1d record: slo:sli_error:ratio_rate1d - expr: | - (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos",reason="timed_out"}[3d]))) + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos",reason!="completed"}[3d]))) / - (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos"}[3d]))) + (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos"}[3d]))) labels: sloth_id: workload-schedulability-canary sloth_service: workload-schedulability diff --git a/tests/golden/network-only/openshift4-slos/openshift4-slos/10_secrets.yaml b/tests/golden/network-only/openshift4-slos/openshift4-slos/10_secrets.yaml new file mode 100644 index 0000000..e69de29 diff --git a/tests/network-only.yml b/tests/network-only.yml index 904f958..baed32f 100644 --- a/tests/network-only.yml +++ b/tests/network-only.yml @@ -4,6 +4,8 @@ parameters: openshift4_slos: slos: storage: + canary: + enabled: false csi-operations: enabled: false ingress: