Implement canary storage SLO
bastjan committed Jul 23, 2024
1 parent 3f9e578 commit 1dd3cee
Showing 8 changed files with 941 additions and 51 deletions.
13 changes: 13 additions & 0 deletions class/defaults.yml
@@ -52,6 +52,19 @@ parameters:
          _sli:
            volume_plugin: "kubernetes.io/csi.+"
            operation_name: ".+"
        canary:
          enabled: true
          objective: 99.0
          _sli:
            volume_plugins_default_params:
              size: 1Gi
              accessMode: ReadWriteOnce
              interval: 1m
              maxPodCompletionTimeout: 3m

            volume_plugins:
              # Empty value for the default plugin
              "": {}
      ingress:
        canary:
          enabled: true
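
Each key in volume_plugins names a storage class to deploy a canary for; the empty key selects the cluster's default storage class. Per-class values are merged over volume_plugins_default_params, and setting a class to null disables its canary entirely. A hypothetical hierarchy override (the class name is illustrative, taken from the test fixtures below) might look like this:

parameters:
  openshift4_slos:
    slos:
      storage:
        canary:
          _sli:
            volume_plugins:
              # canary on an RWX-capable class, overriding one default parameter
              "cephfs-fspool-cluster":
                accessMode: ReadWriteMany
              # opt out of the default-class canary
              "": null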
82 changes: 82 additions & 0 deletions component/main.jsonnet
@@ -100,6 +100,87 @@ local canary = kube._Object('monitoring.appuio.io/v1beta1', 'SchedulerCanary', '
  },
};

local storageCanaries = std.flattenArrays(std.filterMap(
  function(storageclass) params.slos.storage.canary._sli.volume_plugins[storageclass] != null,
  function(storageclass)
    local p = params.slos.storage.canary._sli.volume_plugins_default_params + com.makeMergeable(params.slos.storage.canary._sli.volume_plugins[storageclass]);
    local manifestName = 'canary-%s' % if storageclass == '' then 'default' else storageclass;
    [
      kube.PersistentVolumeClaim(manifestName) {
        metadata+: {
          namespace: params.namespace,
        },
        spec+: {
          accessModes: [ p.accessMode ],
          resources: {
            requests: {
              storage: p.size,
            },
          },
          [if storageclass != '' then 'storageClassName']: storageclass,
        },
      },
      kube._Object('monitoring.appuio.io/v1beta1', 'SchedulerCanary', manifestName) {
        metadata+: {
          namespace: params.namespace,
        },
        spec: {
          interval: p.interval,
          maxPodCompletionTimeout: p.maxPodCompletionTimeout,
          forbidParallelRuns: true,
          podTemplate: {
            metadata: {},
            spec: {
              affinity: {
                nodeAffinity: params.canary_node_affinity,
              },
              containers: [
                {
                  command: [
                    'sh',
                    '-c',
                  ],
                  args: [
                    std.join(';', [
                      'echo test > /testmount/test',
                      'rm -f /testmount/test',
                    ]),
                  ],
                  image: 'image-registry.openshift-image-registry.svc:5000/%s/%s:latest' % [ canaryImageStream.metadata.namespace, canaryImageStream.metadata.name ],
                  imagePullPolicy: 'Always',
                  name: 'storage',
                  resources: {},
                  terminationMessagePath: '/dev/termination-log',
                  terminationMessagePolicy: 'File',
                  volumeMounts: [
                    {
                      mountPath: '/testmount',
                      name: 'test',
                    },
                  ],
                },
              ],
              volumes: [
                {
                  name: 'test',
                  persistentVolumeClaim: {
                    claimName: manifestName,
                  },
                },
              ],
              restartPolicy: 'Never',
              schedulerName: 'default-scheduler',
              securityContext: {},
              terminationGracePeriodSeconds: 10,
            },
          },
        },
      },
    ],
  std.objectFields(params.slos.storage.canary._sli.volume_plugins)
));

{
  '00_namespace': kube.Namespace(params.namespace) {
    metadata+: {
@@ -114,6 +195,7 @@ local canary = kube._Object('monitoring.appuio.io/v1beta1', 'SchedulerCanary', '
  },
  [if params.canary_scheduler_controller.enabled then '30_canaryImageStream']: canaryImageStream,
  [if params.canary_scheduler_controller.enabled then '30_canary']: canary,
  [if params.canary_scheduler_controller.enabled then '32_storageCanary']: storageCanaries,
}
+ blackbox.deployment
+ blackbox.probes
106 changes: 69 additions & 37 deletions component/slos.libsonnet
@@ -30,8 +30,8 @@ local defaultSlos = {
           sli: {
             events: {
               local queryParams = { namespace: params.namespace },
-              error_query: 'sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="%(namespace)s",reason="timed_out"}[{{.window}}]))' % queryParams,
-              total_query: 'sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="%(namespace)s"}[{{.window}}]))' % queryParams,
+              error_query: 'sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="%(namespace)s",reason="timed_out"}[{{.window}}]))' % queryParams,
+              total_query: 'sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="%(namespace)s"}[{{.window}}]))' % queryParams,
             },
           },
           alerting: {
@@ -52,43 +52,75 @@ local defaultSlos = {
     sloth_input: {
       version: 'prometheus/v1',
       service: 'storage',
-      _slos: {
-        'csi-operations': {
-          description: 'SLO based on number of failed csi operations',
-          sli: {
-            events: {
-              // We use `or on() vector(0)` here to ensure we always have a
-              // value for the error query, even if there are 0 failing storage
-              // operations in a time window. This is necessary because the
-              // timeseries for status="fail-unknown" may not exist at all if
-              // there are no failures.
-              error_query:
-                'sum(rate(storage_operation_duration_seconds_count{volume_plugin=~"%s",operation_name=~"%s",status="fail-unknown"}[{{.window}}])) or on() vector(0)'
-                % [ config['csi-operations']._sli.volume_plugin, config['csi-operations']._sli.operation_name ],
-              total_query:
-                // We use `(sum() > 0) or on() vector(1)` to guard against time
-                // windows where we have 0 storage operations, which would
-                // otherwise result in a division by 0. We do this because
-                // dividing by 0 results in the whole expression evaluating to
-                // NaN, which breaks the SLO alert.
-                // Note that we can safely divide by 1, since there can't be
-                // any failed operations when there are no operations at all, so
-                // if the `vector(1)` is used, the expression will always
-                // reduce to 0/1.
-                '(sum(rate(storage_operation_duration_seconds_count{volume_plugin=~"%s",operation_name=~"%s"}[{{.window}}])) > 0) or on() vector(1)' %
-                [ config['csi-operations']._sli.volume_plugin, config['csi-operations']._sli.operation_name ],
-            },
-          },
-          alerting: {
-            name: 'SLO_StorageOperationHighErrorRate',
-            annotations: {
-              summary: 'High storage operation error rate',
-            },
-            page_alert: {},
-            ticket_alert: {},
-          },
-        } + config['csi-operations'],
-      },
+      _slos: std.foldl(
+        function(prev, plugin)
+          local storageClassName = if plugin == '' then 'default' else plugin;
+          local canaryName = 'canary-%s' % storageClassName;
+          prev {
+            [canaryName]: {
+              description: 'OpenShift workload schedulability SLO based on github.com/appuio/scheduler-canary-controller canary',
+              sli: {
+                events: {
+                  local queryParams = { name: canaryName, namespace: params.namespace },
+                  error_query: 'sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="%(name)s",exported_namespace="%(namespace)s",reason="timed_out"}[{{.window}}]))' % queryParams,
+                  total_query: 'sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="%(name)s",exported_namespace="%(namespace)s"}[{{.window}}]))' % queryParams,
+                },
+              },
+              alerting: {
+                name: 'SLO_StorageCanaryWorkloadTimesOut',
+                annotations: {
+                  summary: 'Storage canary workloads time out.',
+                },
+                labels: {
+                  storageclass: storageClassName,
+                },
+                page_alert: {},
+                ticket_alert: {},
+              },
+            } + config.canary,
+          },
+        std.filter(
+          function(plugin) config.canary._sli.volume_plugins[plugin] != null,
+          std.objectFields(config.canary._sli.volume_plugins)
+        ),
+        {
+          'csi-operations': {
+            description: 'SLO based on number of failed csi operations',
+            sli: {
+              events: {
+                // We use `or on() vector(0)` here to ensure we always have a
+                // value for the error query, even if there are 0 failing storage
+                // operations in a time window. This is necessary because the
+                // timeseries for status="fail-unknown" may not exist at all if
+                // there are no failures.
+                error_query:
+                  'sum(rate(storage_operation_duration_seconds_count{volume_plugin=~"%s",operation_name=~"%s",status="fail-unknown"}[{{.window}}])) or on() vector(0)'
+                  % [ config['csi-operations']._sli.volume_plugin, config['csi-operations']._sli.operation_name ],
+                total_query:
+                  // We use `(sum() > 0) or on() vector(1)` to guard against time
+                  // windows where we have 0 storage operations, which would
+                  // otherwise result in a division by 0. We do this because
+                  // dividing by 0 results in the whole expression evaluating to
+                  // NaN, which breaks the SLO alert.
+                  // Note that we can safely divide by 1, since there can't be
+                  // any failed operations when there are no operations at all, so
+                  // if the `vector(1)` is used, the expression will always
+                  // reduce to 0/1.
+                  '(sum(rate(storage_operation_duration_seconds_count{volume_plugin=~"%s",operation_name=~"%s"}[{{.window}}])) > 0) or on() vector(1)' %
+                  [ config['csi-operations']._sli.volume_plugin, config['csi-operations']._sli.operation_name ],
+              },
+            },
+            alerting: {
+              name: 'SLO_StorageOperationHighErrorRate',
+              annotations: {
+                summary: 'High storage operation error rate',
+              },
+              page_alert: {},
+              ticket_alert: {},
+            },
+          } + config['csi-operations'],
+        }
+      ),
     },
   },
ingress: {
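
Rendered by Sloth, the error and total queries are combined into a ratio, so the two guards keep the expression defined in every window: with zero operations it reduces to 0/1 = 0 instead of NaN. A sketch of the resulting recording rule, assuming the default volume_plugin and operation_name patterns, a 5m window, and with the sloth labels omitted:

- expr: |
    (sum(rate(storage_operation_duration_seconds_count{volume_plugin=~"kubernetes.io/csi.+",operation_name=~".+",status="fail-unknown"}[5m])) or on() vector(0))
    /
    ((sum(rate(storage_operation_duration_seconds_count{volume_plugin=~"kubernetes.io/csi.+",operation_name=~".+"}[5m])) > 0) or on() vector(1))
  record: slo:sli_error:ratio_rate5m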
10 changes: 10 additions & 0 deletions tests/defaults.yml
@@ -84,3 +84,13 @@ parameters:
          staticConfig:
            static:
              - https://www.appuio.ch/

    slos:
      storage:
        canary:
          _sli:
            volume_plugins:
              "bulk": {}
              "blub": null
              "cephfs-fspool-cluster":
                accessMode: ReadWriteMany
@@ -0,0 +1,184 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  annotations: {}
  labels:
    name: canary-default
  name: canary-default
  namespace: appuio-openshift4-slos
spec:
  accessModes:
  - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
---
apiVersion: monitoring.appuio.io/v1beta1
kind: SchedulerCanary
metadata:
  annotations: {}
  labels:
    name: canary-default
  name: canary-default
  namespace: appuio-openshift4-slos
spec:
  forbidParallelRuns: true
  interval: 1m
  maxPodCompletionTimeout: 3m
  podTemplate:
    metadata: {}
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: node-role.kubernetes.io/app
                operator: Exists
      containers:
      - args:
        - echo test > /testmount/test;rm -f /testmount/test
        command:
        - sh
        - -c
        image: image-registry.openshift-image-registry.svc:5000/appuio-openshift4-slos/canary:latest
        imagePullPolicy: Always
        name: storage
        resources: {}
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File
        volumeMounts:
        - mountPath: /testmount
          name: test
      restartPolicy: Never
      schedulerName: default-scheduler
      securityContext: {}
      terminationGracePeriodSeconds: 10
      volumes:
      - name: test
        persistentVolumeClaim:
          claimName: canary-default
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  annotations: {}
  labels:
    name: canary-bulk
  name: canary-bulk
  namespace: appuio-openshift4-slos
spec:
  accessModes:
  - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
  storageClassName: bulk
---
apiVersion: monitoring.appuio.io/v1beta1
kind: SchedulerCanary
metadata:
  annotations: {}
  labels:
    name: canary-bulk
  name: canary-bulk
  namespace: appuio-openshift4-slos
spec:
  forbidParallelRuns: true
  interval: 1m
  maxPodCompletionTimeout: 3m
  podTemplate:
    metadata: {}
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: node-role.kubernetes.io/app
                operator: Exists
      containers:
      - args:
        - echo test > /testmount/test;rm -f /testmount/test
        command:
        - sh
        - -c
        image: image-registry.openshift-image-registry.svc:5000/appuio-openshift4-slos/canary:latest
        imagePullPolicy: Always
        name: storage
        resources: {}
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File
        volumeMounts:
        - mountPath: /testmount
          name: test
      restartPolicy: Never
      schedulerName: default-scheduler
      securityContext: {}
      terminationGracePeriodSeconds: 10
      volumes:
      - name: test
        persistentVolumeClaim:
          claimName: canary-bulk
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  annotations: {}
  labels:
    name: canary-cephfs-fspool-cluster
  name: canary-cephfs-fspool-cluster
  namespace: appuio-openshift4-slos
spec:
  accessModes:
  - ReadWriteMany
  resources:
    requests:
      storage: 1Gi
  storageClassName: cephfs-fspool-cluster
---
apiVersion: monitoring.appuio.io/v1beta1
kind: SchedulerCanary
metadata:
  annotations: {}
  labels:
    name: canary-cephfs-fspool-cluster
  name: canary-cephfs-fspool-cluster
  namespace: appuio-openshift4-slos
spec:
  forbidParallelRuns: true
  interval: 1m
  maxPodCompletionTimeout: 3m
  podTemplate:
    metadata: {}
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: node-role.kubernetes.io/app
                operator: Exists
      containers:
      - args:
        - echo test > /testmount/test;rm -f /testmount/test
        command:
        - sh
        - -c
        image: image-registry.openshift-image-registry.svc:5000/appuio-openshift4-slos/canary:latest
        imagePullPolicy: Always
        name: storage
        resources: {}
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File
        volumeMounts:
        - mountPath: /testmount
          name: test
      restartPolicy: Never
      schedulerName: default-scheduler
      securityContext: {}
      terminationGracePeriodSeconds: 10
      volumes:
      - name: test
        persistentVolumeClaim:
          claimName: canary-cephfs-fspool-cluster
567 changes: 567 additions & 0 deletions tests/golden/defaults/openshift4-slos/openshift4-slos/storage.yaml

Large diffs are not rendered by default.
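
The unrendered storage.yaml golden file holds the Sloth output for the storage service: recording and alerting rules for the csi-operations SLO plus rules for each storage canary SLO. As a sketch, one of its recording rules should look roughly like the following, extrapolated from the queries in component/slos.libsonnet and the rule layout in the file below (the label values are an assumption):

- expr: |
    (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary-default",exported_namespace="appuio-openshift4-slos",reason="timed_out"}[5m])))
    /
    (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary-default",exported_namespace="appuio-openshift4-slos"}[5m])))
  labels:
    sloth_id: storage-canary-default
    sloth_service: storage
    sloth_slo: canary-default
    sloth_window: 5m
  record: slo:sli_error:ratio_rate5m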

@@ -11,69 +11,69 @@ spec:
   - name: sloth-slo-sli-recordings-workload-schedulability-canary
     rules:
     - expr: |
-        (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos",reason="timed_out"}[5m])))
+        (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos",reason="timed_out"}[5m])))
         /
-        (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos"}[5m])))
+        (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos"}[5m])))
       labels:
         sloth_id: workload-schedulability-canary
         sloth_service: workload-schedulability
         sloth_slo: canary
         sloth_window: 5m
       record: slo:sli_error:ratio_rate5m
     - expr: |
-        (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos",reason="timed_out"}[30m])))
+        (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos",reason="timed_out"}[30m])))
         /
-        (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos"}[30m])))
+        (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos"}[30m])))
       labels:
         sloth_id: workload-schedulability-canary
         sloth_service: workload-schedulability
         sloth_slo: canary
         sloth_window: 30m
       record: slo:sli_error:ratio_rate30m
     - expr: |
-        (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos",reason="timed_out"}[1h])))
+        (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos",reason="timed_out"}[1h])))
         /
-        (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos"}[1h])))
+        (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos"}[1h])))
       labels:
         sloth_id: workload-schedulability-canary
         sloth_service: workload-schedulability
         sloth_slo: canary
         sloth_window: 1h
       record: slo:sli_error:ratio_rate1h
     - expr: |
-        (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos",reason="timed_out"}[2h])))
+        (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos",reason="timed_out"}[2h])))
         /
-        (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos"}[2h])))
+        (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos"}[2h])))
       labels:
         sloth_id: workload-schedulability-canary
         sloth_service: workload-schedulability
         sloth_slo: canary
         sloth_window: 2h
       record: slo:sli_error:ratio_rate2h
     - expr: |
-        (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos",reason="timed_out"}[6h])))
+        (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos",reason="timed_out"}[6h])))
         /
-        (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos"}[6h])))
+        (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos"}[6h])))
       labels:
         sloth_id: workload-schedulability-canary
         sloth_service: workload-schedulability
         sloth_slo: canary
         sloth_window: 6h
       record: slo:sli_error:ratio_rate6h
     - expr: |
-        (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos",reason="timed_out"}[1d])))
+        (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos",reason="timed_out"}[1d])))
         /
-        (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos"}[1d])))
+        (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos"}[1d])))
       labels:
         sloth_id: workload-schedulability-canary
         sloth_service: workload-schedulability
         sloth_slo: canary
         sloth_window: 1d
       record: slo:sli_error:ratio_rate1d
     - expr: |
-        (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos",reason="timed_out"}[3d])))
+        (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos",reason="timed_out"}[3d])))
         /
-        (sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="appuio-openshift4-slos"}[3d])))
+        (sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="appuio-openshift4-slos"}[3d])))
       labels:
         sloth_id: workload-schedulability-canary
         sloth_service: workload-schedulability
2 changes: 2 additions & 0 deletions tests/network-only.yml
@@ -4,6 +4,8 @@ parameters:
  openshift4_slos:
    slos:
      storage:
        canary:
          enabled: false
        csi-operations:
          enabled: false
      ingress:
