Skip to content

Commit

Permalink
Improve alerting
Browse files Browse the repository at this point in the history
- Add more alerting test domains
- alert if at least 2 domains fail
- don't test mail on domains that have no MX
- also measure if probe passed or not
- don't measure scores for probes that don't count towards the total score (appsecpriv)
- add metrics for failed probes on timeout
  • Loading branch information
aequitas committed Sep 18, 2024
1 parent 6d587d8 commit 629b5fb
Show file tree
Hide file tree
Showing 3 changed files with 429 additions and 52 deletions.
50 changes: 42 additions & 8 deletions docker/cron/periodic/15min/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,30 @@
URL_BASE = f"http://{IPV4_IP_APP_INTERNAL}:8080"
HEADERS = {"Host": INTERNETNL_DOMAINNAME}

# domain's to use in website tests
WEBSITE_TEST_DOMAINS = [
"example.nl",
"example.com",
]
TEST_DOMAINS = {
    # domains to use in website tests
"site": [
"internet.nl",
"example.nl",
"example.com",
"internetsociety.org",
"ripe.net",
"surf.nl",
"ecp.nl",
"forumstandaardisatie.nl",
"minez.nl",
],
    # domains to use in mail tests
"mail": [
"internet.nl",
"internetsociety.org",
"ripe.net",
"surf.nl",
"ecp.nl",
"forumstandaardisatie.nl",
"minez.nl",
],
}


METRIC_PROBE_DONE = Gauge("tests_probe_done_total", "Whether the probe completed.", ["test", "domain", "probe"])
Expand All @@ -46,12 +65,13 @@
"tests_probe_runtime_seconds", "Amount of time probe ran before done.", ["test", "domain", "probe"]
)
METRIC_PROBE_SCORE = Gauge("tests_probe_score", "Score of the probe.", ["test", "domain", "probe"])
METRIC_PROBE_PASSED = Gauge("tests_probe_pass", "Probe has passed.", ["test", "domain", "probe"])

METRIC_TEST_RUN = Gauge("tests_test_run_total", "Test that have been run.", ["test", "domain"])
METRIC_TEST_CACHE = Gauge("tests_test_cached_total", "Test runs that returned cached results.", ["test", "domain"])
METRIC_TEST_FAILURE = Gauge("tests_test_failure_total", "Test runs that failed.", ["test", "domain"])
METRIC_TEST_SUCCESS = Gauge("tests_test_success_total", "Test runs that succeeded.", ["test", "domain"])
METRIC_TEST_TIMEOUT = Gauge("tests_test_timeout", "Test that ran into timeout.", ["test", "domain"])
METRIC_TEST_TIMEOUT = Gauge("tests_test_timeout_total", "Test that ran into timeout.", ["test", "domain"])
METRIC_TEST_RUNTIME = Gauge("tests_test_runtime_seconds", "Amount of time test ran before done.", ["test", "domain"])


Expand Down Expand Up @@ -99,11 +119,20 @@ def run_tests_on_domain(test, domain):

# stop when all probes are finished
if not [p for p in probes if not p["done"]]:
METRIC_TEST_SUCCESS.labels(test, domain).set(1)
break

time.sleep(1)
else:
# test timed out because one or more of the probes was not done within time
METRIC_TEST_TIMEOUT.labels(test, domain).set(1)
for probe in probes:
if probe["name"] in finished_probes:
continue
# record not finished probes as failed
METRIC_PROBE_DONE.labels(test, domain, probe["name"]).set(probe["done"])
METRIC_PROBE_SUCCESS.labels(test, domain, probe["name"]).set(probe["success"])
METRIC_PROBE_RUNTIME.labels(test, domain, probe["name"]).set(int(time.time() - test_start))

METRIC_TEST_RUNTIME.labels(test, domain).set(int(time.time() - test_start))

Expand All @@ -119,18 +148,23 @@ def run_tests_on_domain(test, domain):
r.raise_for_status()
if r.status_code == 200:
probe_result = r.json()
METRIC_PROBE_SCORE.labels(test, domain, probe_name).set(probe_result["totalscore"])
# only measure probe scores that count towards total score
if probe_result["maxscore"]:
METRIC_PROBE_SCORE.labels(test, domain, probe_name).set(probe_result["totalscore"])
METRIC_PROBE_PASSED.labels(test, domain, probe_name).set(probe_result["verdict"] == "passed")
except Exception:
log.exception("failed to get probe score")


def run_tests():
for test in TESTS:
for domain in WEBSITE_TEST_DOMAINS:
for domain in TEST_DOMAINS[test]:
log.info(f"testing: {test} {domain}")
METRIC_TEST_RUN.labels(test, domain).set(1)
METRIC_TEST_CACHE.labels(test, domain).set(0)
METRIC_TEST_FAILURE.labels(test, domain).set(0)
METRIC_TEST_TIMEOUT.labels(test, domain).set(0)
METRIC_TEST_SUCCESS.labels(test, domain).set(0)
try:
run_tests_on_domain(test, domain)
except Exception:
Expand Down
17 changes: 14 additions & 3 deletions docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1072,12 +1072,23 @@ configs:
groups:
- name: End to end monitoring
rules:
- alert: HighTestRuntime
expr: min(tests_test_runtime_seconds{test="site"})>=10 and max(tests_test_runtime_seconds{test="site"})>=30
- alert: HighTestRuntimeSite
# when site probes for 2 or more of the test domains take longer than 30 seconds something is wrong
expr: count(tests_test_runtime_seconds{test="site"} >= 30) >= 2
annotations:
host: $INTERNETNL_DOMAINNAME
summary: Tests/probes take longer to complete than expected
        summary: Two or more tests for site take longer to complete than expected
dashboard: 'https://$INTERNETNL_DOMAINNAME/grafana/d/af7d1d82-c0f9-4d8d-bc03-542c4c4c75c0/periodic-tests'
- alert: HighTestRuntimeMail
# when mail probes for 2 or more of the test domains take longer than 30 seconds something is wrong
# minez.nl and forumstandaardisatie.nl currently time out on mail test tls probe, skipping for now, should be solved when switching to sslyze
        expr: count(tests_test_runtime_seconds{test="mail", domain!~"minez.nl|forumstandaardisatie.nl"} >= 30) >= 2
annotations:
host: $INTERNETNL_DOMAINNAME
summary: Two or more tests for mail take longer to complete than expected
dashboard: 'https://$INTERNETNL_DOMAINNAME/grafana/d/af7d1d82-c0f9-4d8d-bc03-542c4c4c75c0/periodic-tests'
alertmanager_config:
content: |
global:
Expand Down
Loading

0 comments on commit 629b5fb

Please sign in to comment.