Skip to content

Commit

Permalink
chore: Bootstrap alarms (#75)
Browse files Browse the repository at this point in the history
  • Loading branch information
gonzalezzfelipe authored Jan 16, 2025
1 parent e1b84b9 commit c0a0c87
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 2 deletions.
25 changes: 24 additions & 1 deletion bootstrap/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ module "ogmios_v1_feature" {
api_key_salt = var.api_key_salt
dcu_per_frame = var.dcu_per_frame
dns_zone = var.dns_zone
resources = var.operator_resources
}

module "ogmios_v1_proxy" {
Expand Down Expand Up @@ -76,8 +77,26 @@ module "ogmios_instances" {
ogmios_image = each.value.ogmios_image
node_private_dns = each.value.node_private_dns
ogmios_version = each.value.ogmios_version
tolerations = each.value.tolerations
replicas = each.value.replicas
tolerations = coalesce(each.value.tolerations, [
{
effect = "NoSchedule"
key = "demeter.run/compute-profile"
operator = "Exists"
},
{
effect = "NoSchedule"
key = "demeter.run/compute-arch"
operator = "Equal"
value = "arm64"
},
{
effect = "NoSchedule"
key = "demeter.run/availability-sla"
operator = "Equal"
value = "consistent"
}
])
}

module "ogmios_services" {
Expand All @@ -90,4 +109,8 @@ module "ogmios_services" {
network = each.value.network
}

// Grafana alerting for Ogmios, defined in the local ./monitoring module.
module "ogmios_monitoring" {
  source = "./monitoring"

  // UID of the Grafana datasource that the module's alert query runs against.
  o11y_datasource_uid = var.o11y_datasource_uid
}
100 changes: 100 additions & 0 deletions bootstrap/monitoring/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
// Provider requirements for this module.
terraform {
  required_providers {
    // Supplies the grafana_folder and grafana_rule_group resources used in this file.
    grafana = {
      version = ">= 1.28.2"
      source  = "grafana/grafana"
    }
  }
}

// Required input: there is no default, so callers must always provide it.
variable "o11y_datasource_uid" {
  type        = string
  description = "UID of the Grafana datasource queried by the Ogmios alert rules."
}

// Grafana folder that holds the Ogmios alert rule group(s) defined in this file.
resource "grafana_folder" "folder" {
  title = "Ogmios"
}

// Alert rule group that flags Ogmios pods which stopped reporting the
// `ogmios_connected` metric.
resource "grafana_rule_group" "instance_is_down" {
  name             = "Ogmios is down"
  folder_uid       = grafana_folder.folder.uid
  interval_seconds = 60
  org_id           = 1

  rule {
    name      = "Ogmios is down"
    condition = "B" // ref_id of the data block whose result decides the alert state
    for       = "5m"

    // Both "no data" and query execution errors are reported as OK.
    no_data_state  = "OK"
    exec_err_state = "OK"

    annotations = {
      description = "We are not receiving more metrics from a particular Ogmios instance."
      // Go template: prints the `pod` label of every value whose key matches A<n>.
      summary = "{{ range $k, $v := $values -}}\n{{ if (match \"A[0-9]+\" $k) -}}\nPod: {{ $v.Labels.pod }}\n{{ end }}\n{{ end }}"
    }

    // Query A: runs against the datasource supplied by the caller.
    data {
      ref_id         = "A"
      datasource_uid = var.o11y_datasource_uid

      relative_time_range {
        from = 3600
        to   = 0
      }

      model = jsonencode({
        editorMode = "code"
        // Pods that had `ogmios_connected` samples in a 10m window one hour ago
        // but have none in the most recent 10m window (`unless` drops pods
        // present on both sides).
        expr          = "count(avg_over_time(ogmios_connected[10m] offset 1h)) by (pod) unless count(avg_over_time(ogmios_connected[10m])) by (pod)"
        hide          = false
        intervalMs    = 1000
        legendFormat  = "__auto"
        maxDataPoints = 43200
        range         = true
        refId         = "A"
      })
    }

    // Expression B: classic condition evaluated on the result of query A.
    data {
      ref_id         = "B"
      datasource_uid = "-100" // paired with the `__expr__` datasource type in the model below

      relative_time_range {
        from = 3600
        to   = 0
      }

      model = jsonencode({
        conditions = [
          {
            // True when count_non_null(A) > 0.
            evaluator = {
              params = [0]
              type   = "gt"
            }
            operator = {
              type = "and"
            }
            query = {
              params = [
                "A"
              ]
            }
            reducer = {
              params = []
              type   = "count_non_null"
            }
            type = "query"
          }
        ]
        datasource = {
          type = "__expr__"
          uid  = "-100"
        }
        expression    = "A"
        hide          = false
        intervalMs    = 1000
        maxDataPoints = 43200
        refId         = "B"
        type          = "classic_conditions"
      })
    }
  }
}
5 changes: 4 additions & 1 deletion bootstrap/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ variable "versions" {
default = ["5", "6"]
}

// Required input: there is no default, so it must be set for every deployment.
variable "o11y_datasource_uid" {
  type        = string
  description = "UID of the Grafana datasource used by the Ogmios monitoring alerts; forwarded to the ./monitoring module."
}

// operator settings

variable "operator_image_tag" {
Expand Down Expand Up @@ -145,7 +149,6 @@ variable "instances" {
ogmios_image = string
node_private_dns = string
ogmios_version = string
compute_arch = string
replicas = number
resources = optional(object({
limits = object({
Expand Down

0 comments on commit c0a0c87

Please sign in to comment.