Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement canary storage SLO #74

Merged
merged 1 commit into from
Jul 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions class/defaults.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,19 @@ parameters:
_sli:
volume_plugin: "kubernetes.io/csi.+"
operation_name: ".+"
canary:
enabled: true
objective: 99.0
_sli:
volume_plugins_default_params:
size: 1Gi
accessMode: ReadWriteOnce
interval: 1m
maxPodCompletionTimeout: 3m

volume_plugins:
# Empty key ("") selects the default storage class
"": {}
ingress:
canary:
enabled: true
Expand All @@ -76,6 +89,8 @@ parameters:

specs: {}

secrets: {}

controller_node_affinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
Expand Down
86 changes: 86 additions & 0 deletions component/main.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,90 @@ local canary = kube._Object('monitoring.appuio.io/v1beta1', 'SchedulerCanary', '
},
};

// Build the canary manifests for the storage SLO: for every storage class
// configured in `slos.storage.canary._sli.volume_plugins` (entries set to
// `null` are skipped, i.e. the canary is disabled for that class), emit a
// [PVC, SchedulerCanary] pair. The per-class arrays are flattened into a
// single list of manifests.
local storageCanaries = std.flattenArrays(std.filterMap(
  // Filter: a `null` value disables the canary for that storage class.
  function(storageclass) params.slos.storage.canary._sli.volume_plugins[storageclass] != null,
  function(storageclass)
    // Effective parameters: class-specific overrides merged over the defaults.
    local p = params.slos.storage.canary._sli.volume_plugins_default_params + com.makeMergeable(params.slos.storage.canary._sli.volume_plugins[storageclass]);
    // The empty key "" denotes the cluster's default storage class; its
    // manifests are named `storage-canary-default`.
    local manifestName = 'storage-canary-%s' % if storageclass == '' then 'default' else storageclass;
    [
      // Long-lived PVC the canary pod mounts on every run.
      kube.PersistentVolumeClaim(manifestName) {
        metadata+: {
          namespace: params.namespace,
        },
        spec+: {
          accessModes: [ p.accessMode ],
          resources: {
            requests: {
              storage: p.size,
            },
          },
          // Omit `storageClassName` entirely for the default class so the
          // cluster default provisioner is used.
          [if storageclass != '' then 'storageClassName']: storageclass,
        },
      },
      // SchedulerCanary CR: periodically runs a pod that exercises the PVC.
      kube._Object('monitoring.appuio.io/v1beta1', 'SchedulerCanary', manifestName) {
        metadata+: {
          namespace: params.namespace,
        },
        spec: {
          interval: p.interval,
          maxPodCompletionTimeout: p.maxPodCompletionTimeout,
          forbidParallelRuns: true,
          podTemplate: {
            metadata: {},
            spec: {
              affinity: {
                nodeAffinity: params.canary_node_affinity,
              },
              containers: [
                {
                  command: [
                    'sh',
                    '-c',
                  ],
                  // Smoke test: write a timestamped file, read it back,
                  // verify the contents, then remove it.
                  // NOTE(review): `set -o pipefail` is not POSIX sh; this
                  // assumes the image's /bin/sh supports it (bash/busybox
                  // ash do) — confirm against the canary image.
                  args: [
                    std.join(';\n', [
                      'set -euo pipefail',
                      'f="/testmount/t-`date -Iseconds`"',
                      'echo test > "$f"',
                      'test `cat "$f"` = "test"',
                      'rm -f "$f"',
                    ]),
                  ],
                  // Image is pulled from the in-cluster registry via the
                  // ImageStream managed elsewhere in this component.
                  image: 'image-registry.openshift-image-registry.svc:5000/%s/%s:latest' % [ canaryImageStream.metadata.namespace, canaryImageStream.metadata.name ],
                  imagePullPolicy: 'Always',
                  name: 'storage',
                  resources: {},
                  terminationMessagePath: '/dev/termination-log',
                  terminationMessagePolicy: 'File',
                  volumeMounts: [
                    {
                      mountPath: '/testmount',
                      name: 'test',
                    },
                  ],
                },
              ],
              volumes: [
                {
                  name: 'test',
                  persistentVolumeClaim: {
                    // Mounts the PVC generated alongside this CR (same name).
                    claimName: manifestName,
                  },
                },
              ],
              restartPolicy: 'Never',
              schedulerName: 'default-scheduler',
              securityContext: {},
              terminationGracePeriodSeconds: 10,
            },
          },
        },
      },
    ]
  ,
  // Iterate over all configured storage class names (object keys).
  std.objectFields(params.slos.storage.canary._sli.volume_plugins)
));

{
'00_namespace': kube.Namespace(params.namespace) {
metadata+: {
Expand All @@ -112,8 +196,10 @@ local canary = kube._Object('monitoring.appuio.io/v1beta1', 'SchedulerCanary', '
},
},
},
'10_secrets': com.generateResources(params.secrets, function(name) com.namespaced(params.namespace, kube.Secret(name))),
[if params.canary_scheduler_controller.enabled then '30_canaryImageStream']: canaryImageStream,
[if params.canary_scheduler_controller.enabled then '30_canary']: canary,
[if params.canary_scheduler_controller.enabled then '32_storageCanary']: storageCanaries,
}
+ blackbox.deployment
+ blackbox.probes
Expand Down
106 changes: 69 additions & 37 deletions component/slos.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ local defaultSlos = {
sli: {
events: {
local queryParams = { namespace: params.namespace },
error_query: 'sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="%(namespace)s",reason="timed_out"}[{{.window}}]))' % queryParams,
total_query: 'sum by (name) (rate(scheduler_canary_pod_until_completed_seconds_count{exported_namespace="%(namespace)s"}[{{.window}}]))' % queryParams,
error_query: 'sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="%(namespace)s",reason!="completed"}[{{.window}}]))' % queryParams,
total_query: 'sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="canary",exported_namespace="%(namespace)s"}[{{.window}}]))' % queryParams,
},
},
alerting: {
Expand All @@ -52,43 +52,75 @@ local defaultSlos = {
sloth_input: {
version: 'prometheus/v1',
service: 'storage',
_slos: {
'csi-operations': {
description: 'SLO based on number of failed csi operations',
sli: {
events: {
// We use `or on() vector(0)` here to ensure we always have a
// value for the error query, even if there's 0 failing storage
// operations in a time window. This is necessary because the
// timeseries for status="fail-unknown" may not exist at all if
// there's no failures.
error_query:
'sum(rate(storage_operation_duration_seconds_count{volume_plugin=~"%s",operation_name=~"%s",status="fail-unknown"}[{{.window}}])) or on() vector(0)'
% [ config['csi-operations']._sli.volume_plugin, config['csi-operations']._sli.operation_name ],
total_query:
// We use (sum() > 0) or on() vector(1)) to guard against time
// windows where we have 0 storage operations, which would
// otherwise result in a division by 0. We do this because,
// dividing by 0 results in the whole expression evaluating to
// NaN which breaks the SLO alert.
// Note that we can safely divide by 1, since there can't be
// any failed operations when there's no operations at all, so
// if the `vector(1)` is used, the expression will always
// reduce to 0/1.
'(sum(rate(storage_operation_duration_seconds_count{volume_plugin=~"%s",operation_name=~"%s"}[{{.window}}])) > 0) or on() vector(1)' %
[ config['csi-operations']._sli.volume_plugin, config['csi-operations']._sli.operation_name ],
},
_slos: std.foldl(
function(prev, plugin)
local storageClassName = if plugin == '' then 'default' else plugin;
local canaryName = 'storage-canary-%s' % storageClassName;
prev {
[canaryName]: {
description: 'OpenShift workload schedulability SLO based on github.com/appuio/scheduler-canary-controller canary',
sli: {
events: {
local queryParams = { name: canaryName, namespace: params.namespace },
error_query: 'sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="%(name)s",exported_namespace="%(namespace)s",reason!="completed"}[{{.window}}]))' % queryParams,
total_query: 'sum(rate(scheduler_canary_pod_until_completed_seconds_count{name="%(name)s",exported_namespace="%(namespace)s"}[{{.window}}]))' % queryParams,
},
},
alerting: {
name: 'SLO_StorageCanaryWorkloadTimesOut',
annotations: {
summary: 'Storage canary workloads time out.',
},
labels: {
storageclass: storageClassName,
},
page_alert: {},
ticket_alert: {},
},
} + config.canary,
},
alerting: {
name: 'SLO_StorageOperationHighErrorRate',
annotations: {
summary: 'High storage operation error rate',
std.filter(
function(plugin) config.canary._sli.volume_plugins[plugin] != null,
std.objectFields(config.canary._sli.volume_plugins)
),
{
'csi-operations': {
description: 'SLO based on number of failed csi operations',
sli: {
events: {
// We use `or on() vector(0)` here to ensure we always have a
// value for the error query, even if there's 0 failing storage
// operations in a time window. This is necessary because the
// timeseries for status="fail-unknown" may not exist at all if
// there's no failures.
error_query:
'sum(rate(storage_operation_duration_seconds_count{volume_plugin=~"%s",operation_name=~"%s",status="fail-unknown"}[{{.window}}])) or on() vector(0)'
% [ config['csi-operations']._sli.volume_plugin, config['csi-operations']._sli.operation_name ],
total_query:
// We use (sum() > 0) or on() vector(1)) to guard against time
// windows where we have 0 storage operations, which would
// otherwise result in a division by 0. We do this because,
// dividing by 0 results in the whole expression evaluating to
// NaN which breaks the SLO alert.
// Note that we can safely divide by 1, since there can't be
// any failed operations when there's no operations at all, so
// if the `vector(1)` is used, the expression will always
// reduce to 0/1.
'(sum(rate(storage_operation_duration_seconds_count{volume_plugin=~"%s",operation_name=~"%s"}[{{.window}}])) > 0) or on() vector(1)' %
[ config['csi-operations']._sli.volume_plugin, config['csi-operations']._sli.operation_name ],
},
},
page_alert: {},
ticket_alert: {},
},
} + config['csi-operations'],
},
alerting: {
name: 'SLO_StorageOperationHighErrorRate',
annotations: {
summary: 'High storage operation error rate',
},
page_alert: {},
ticket_alert: {},
},
} + config['csi-operations'],
}
),
},
},
ingress: {
Expand Down
78 changes: 77 additions & 1 deletion docs/modules/ROOT/pages/references/parameters.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,28 @@ Sloth isn't actually deployed to the cluster, but used to render `PrometheusRule
The entry in `images` allows Renovate to create version upgrade PRs.
The Sloth version can be overridden by the `tag` parameter.


== `secrets`

[horizontal]
type:: dictionary
default:: `{}`
example::
+
[source,yaml]
----
secrets:
canary-ssd-encrypted-luks-key:
stringData:
luksKey: XXXXXX
----

This parameter allows creating arbitrary `Secret` resources.

The dictionary keys are used as `metadata.name` for the resulting `Secret` resources.
The secrets are created in the namespace indicated by parameter `namespace`.


== `slos`

[horizontal]
Expand All @@ -51,7 +73,7 @@ csi-operations:
operation_name: ".+"
----

The configuration for the csi-operations SLO.
The configuration for the csi-operations storage SLO.

The SLO can be disabled by setting `enabled` to false.

Expand All @@ -62,6 +84,60 @@ Any additional field is added directly to the `slo` input for sloth.

NOTE: Look at xref:runbooks/storage.adoc#csi-operations[the runbook] for an explanation of this SLO.

=== `slos.storage.canary`

[horizontal]
type:: dictionary
default::
+
[source,yaml]
----
canary:
enabled: true
objective: 99.0
_sli:
volume_plugins_default_params:
size: 1Gi
accessMode: ReadWriteOnce
interval: 1m
maxPodCompletionTimeout: 3m

volume_plugins:
# Empty key ("") selects the default storage class
"": {}
----
example::
+
[source,yaml]
----
canary:
enabled: true
objective: 99.0
_sli:
volume_plugins:
# Disable the canary for the default storage class
"": null
# Enable the canaries for ssd and bulk storage classes
ssd: {}
bulk:
size: 10Gi
----

The configuration for the canary storage SLO.

The SLO can be disabled by setting `enabled` to false.

The canary SLO is measured by creating a PVC for every configured storage class and periodically running a pod that writes a file to the respective PVC, reads it back, and deletes it.
You can configure which volume plugins are tested with `_sli.volume_plugins`.
The key is the storage class name and the value is a dictionary which can override the default parameters set in `volume_plugins_default_params`.
An empty key (`""`) is used for the default storage class.
The value can be set to `null` to disable the canary for a specific storage class.

Any additional field is added directly to the `slo` input for sloth.

NOTE: Look at xref:runbooks/storage.adoc#canaries[the runbook] for an explanation of this SLO.


=== `slos.kubernetes_api.requests`

[horizontal]
Expand Down
Loading
Loading