-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #8 from ap-wtioit/master-enable_healthcheck_github
add healthcheck support for http and smtp
Showing
8 changed files
with
972 additions
and
308 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,184 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import logging | ||
import os | ||
|
||
# log at INFO level so the "checking ..." messages written by the checks are emitted
logging.basicConfig(level=logging.INFO)
# logger shared by all checks in this script
logger = logging.getLogger("healthcheck")
|
||
|
||
def error(message, exception=None):
    """
    Log a failed check and abort the healthcheck.

    :param message: text that is logged at ERROR level before aborting
    :param exception: optional exception to raise after logging; without it the process exits with status 1
    :return: never returns normally
    """
    logger.error(message)
    if exception is not None:
        raise exception
    # ``exit`` is only injected by the ``site`` module (missing under ``python -S``), raising SystemExit
    # directly yields the same exit status without depending on it
    raise SystemExit(1)
|
||
|
||
def http_healthcheck():
    """
    Use pycurl to check if the target server is still responding via proxy.py

    Environment variables read:

    * ``HTTP_HEALTHCHECK_URL`` (default ``http://localhost/``), ``$TARGET`` is replaced with the value of ``TARGET``
    * ``HTTP_HEALTHCHECK_TIMEOUT_MS`` (default ``2000``), used as connect timeout and as total timeout
    * ``TARGET`` (default ``localhost``)

    :return: None
    :raises pycurl.error: passed to ``error`` (logged and re-raised) when the request fails or times out
    """
    from urllib.parse import urlsplit

    import pycurl

    check_url = os.environ.get("HTTP_HEALTHCHECK_URL", "http://localhost/")
    check_timeout_ms = int(os.environ.get("HTTP_HEALTHCHECK_TIMEOUT_MS", 2000))
    target = os.environ.get("TARGET", "localhost")
    check_url_with_target = check_url.replace("$TARGET", target)
    parsed_url = urlsplit(check_url_with_target)
    # the resolve entry below is only used by curl if its port equals the port of the request, so an url
    # without explicit port needs the default port of its scheme (443 for https, 80 otherwise)
    port = parsed_url.port or (443 if parsed_url.scheme == "https" else 80)
    logger.info("checking %s via 127.0.0.1" % check_url_with_target)
    try:
        request = pycurl.Curl()
        try:
            request.setopt(pycurl.URL, check_url_with_target)
            # do not send the request to the target directly but use our own socat proxy process to check if
            # it's still working
            request.setopt(pycurl.RESOLVE, ["{}:{}:127.0.0.1".format(target, port)])
            request.setopt(pycurl.CONNECTTIMEOUT_MS, check_timeout_ms)
            request.setopt(pycurl.TIMEOUT_MS, check_timeout_ms)
            request.perform()
        finally:
            # release the curl handle even if the request failed
            request.close()
    except pycurl.error as e:
        error("error while checking http connection", e)
|
||
|
||
def smtp_healthcheck():
    """
    Use pycurl to check if the target server is still responding via proxy.py

    Environment variables read:

    * ``SMTP_HEALTHCHECK_URL`` (default ``smtp://localhost/``), ``$TARGET`` is replaced with the value of ``TARGET``
    * ``SMTP_HEALTHCHECK_COMMAND`` (default ``HELP``), the command sent to the server
    * ``SMTP_HEALTHCHECK_TIMEOUT_MS`` (default ``2000``), used as connect timeout and as total timeout
    * ``TARGET`` (default ``localhost``)

    :return: None
    :raises pycurl.error: passed to ``error`` (logged and re-raised) when the request fails or times out
    """
    from urllib.parse import urlsplit

    import pycurl

    check_url = os.environ.get("SMTP_HEALTHCHECK_URL", "smtp://localhost/")
    check_command = os.environ.get("SMTP_HEALTHCHECK_COMMAND", "HELP")
    check_timeout_ms = int(os.environ.get("SMTP_HEALTHCHECK_TIMEOUT_MS", 2000))
    target = os.environ.get("TARGET", "localhost")
    check_url_with_target = check_url.replace("$TARGET", target)
    parsed_url = urlsplit(check_url_with_target)
    # the resolve entry below is only used by curl if its port equals the port of the request, so an url
    # without explicit port needs the default port of its scheme (465 for smtps, 25 otherwise)
    port = parsed_url.port or (465 if parsed_url.scheme == "smtps" else 25)
    logger.info("checking %s via 127.0.0.1" % check_url_with_target)
    try:
        request = pycurl.Curl()
        try:
            request.setopt(pycurl.URL, check_url_with_target)
            request.setopt(pycurl.CUSTOMREQUEST, check_command)
            # do not send the request to the target directly but use our own socat proxy process to check if
            # it's still working
            request.setopt(pycurl.RESOLVE, ["{}:{}:127.0.0.1".format(target, port)])
            request.setopt(pycurl.CONNECTTIMEOUT_MS, check_timeout_ms)
            request.setopt(pycurl.TIMEOUT_MS, check_timeout_ms)
            request.perform()
        finally:
            # release the curl handle even if the request failed
            request.close()
    except pycurl.error as e:
        error("error while checking smtp connection", e)
|
||
|
||
def process_healthcheck():
    """
    Check that at least one socat process exists per port and no more than the number of configured max connections
    processes exist for each port.

    The ports are read from the whitespace separated ``PORT`` environment variable, the limit from
    ``MAX_CONNECTIONS``. Processes are found by scanning ``/proc/<pid>/cmdline`` for ``socat``; the port of a
    process is taken from the text after the last ``:`` of its third command line entry. Processes with fewer
    entries or with a port that is not configured are not counted for any port.

    :return: None
    """
    import glob

    ports = os.environ["PORT"].split()
    max_connections = int(os.environ["MAX_CONNECTIONS"])
    logger.info(
        "checking socat processes for port(s) %s having at least one and less than %d socat processes"
        % (ports, max_connections)
    )
    port_process_count = {port: 0 for port in ports}
    socat_process_count = 0
    # scanning /proc ourselves avoids spawning a shell and grep, which would show up in the result with their
    # own command lines and make the whole check fail when any process exits during the scan
    for cmdline_path in glob.glob("/proc/[0-9]*/cmdline"):
        try:
            with open(cmdline_path, "rb") as fp:
                raw_cmdline = fp.read()
        except (FileNotFoundError, ProcessLookupError):
            # ignore processes no longer existing (possibly retrieved an answer)
            continue
        if b"socat" not in raw_cmdline:
            continue
        socat_process_count += 1
        # arguments in /proc/.../cmdline are split by null bytes
        cmd = [
            part
            for part in raw_cmdline.decode("utf-8", errors="replace").split("\x00")
            if part
        ]
        if len(cmd) < 3:
            # mentions socat but has no connect argument to read a port from
            continue
        # the connect argument ends with :{ip}:{port} for our processes
        port = cmd[2].split(":")[-1]
        if port in port_process_count:
            port_process_count[port] += 1
    if socat_process_count < len(ports):
        # if we have less than the number of ports socat processes we can report that directly instead of
        # naming a single port
        error("Expected at least %d socat processes" % len(ports))
    for port in ports:
        if port_process_count[port] == 0:
            error("Missing socat process(es) for port: %s" % port)
        # NOTE(review): this already fails at exactly max_connections + 1 processes while the message says
        # "more than" - confirm which of the two is intended
        if port_process_count[port] >= max_connections + 1:
            error(
                "More than %d + 1 socat process(es) for port: %s"
                % (max_connections, port)
            )
|
||
|
||
def preresolve_healthcheck():
    """
    Verify that every ip found in a running ``udp-connect:``/``tcp-connect:`` command line is still part of the
    dns answer for ``TARGET``, using the nameservers listed in ``NAMESERVERS``.

    If an ip is missing and a second lookup returns the same answer as the first one, the check fails. If the
    answers keep changing, the target is considered dns load-balanced: a flag file is written to the temp
    directory and later runs return without checking anything while that file exists.

    :return: None
    """
    from tempfile import gettempdir

    flag_path = os.path.join(gettempdir(), "load_balancing_dns_detected")
    if os.path.exists(flag_path):
        # a previous run flagged the target as being dns load-balanced, nothing to check anymore
        return

    import subprocess

    from dns.resolver import Resolver

    grep_output = subprocess.check_output(
        ["sh", "-c", "grep -R '\\(udp\\|tcp\\)-connect:' /proc/[0-9]*/cmdline"]
    ).decode("utf-8")
    pre_resolved_ips = set()
    for line in grep_output.split("\n"):
        if line:
            # the third colon separated field of a grep result line is used as the ip
            pre_resolved_ips.add(line.split(":")[2])

    resolver = Resolver()
    resolver.nameservers = os.environ["NAMESERVERS"].split()
    target = os.environ["TARGET"]

    def lookup():
        # one fresh dns query per call, returned as list of addresses
        return [answer.address for answer in resolver.resolve(target)]

    resolved_ips = lookup()
    for ip in pre_resolved_ips:
        logger.info(f"checking {target} resolves to {ip}")
        if ip in resolved_ips:
            continue
        resolved_ips_2 = lookup()
        if resolved_ips_2 == resolved_ips:
            error(
                f"{target} no longer resolves to {ip}, {resolved_ips}, {resolved_ips_2}"
            )
            continue
        # to make sure we didn't just hit the server switch in dns, we check again before deactivating
        # the healthcheck permanently (until the container restarts)
        resolved_ips_3 = lookup()
        if resolved_ips_3 != resolved_ips_2:
            logger.info(
                f"{target} seems to be load-balancing with dns ({resolved_ips} != {resolved_ips_2}), "
                f"deactivating the resolver healthcheck"
            )
            with open(flag_path, "w") as fp:
                fp.write(target)
|
||
|
||
# the socat process check always runs, every other check is switched on by setting its variable to "1"
process_healthcheck()
# read PRE_RESOLVE like the other switches so a missing variable skips the check instead of raising KeyError
if os.environ.get("PRE_RESOLVE", "0") == "1":
    preresolve_healthcheck()
if os.environ.get("HTTP_HEALTHCHECK", "0") == "1":
    http_healthcheck()
if os.environ.get("SMTP_HEALTHCHECK", "0") == "1":
    smtp_healthcheck()
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,141 @@ | ||
version: "3.8" | ||
services: | ||
autoheal: | ||
image: willfarrell/autoheal | ||
restart: unless-stopped | ||
environment: | ||
AUTOHEAL_INTERVAL: 1 | ||
AUTOHEAL_CONTAINER_LABEL: "AUTOHEAL_${COMPOSE_PROJECT_NAME}" | ||
privileged: "${OS_NEEDS_PRIVILEGES_FOR_DOCKER_SOCK:-false}" | ||
volumes: | ||
- /var/run/docker.sock:/var/run/docker.sock | ||
|
||
proxy_preresolve: | ||
build: | ||
dockerfile: Dockerfile | ||
context: .. | ||
labels: | ||
- "AUTOHEAL_${COMPOSE_PROJECT_NAME}=true" | ||
depends_on: | ||
- target | ||
- autoheal | ||
networks: | ||
default: | ||
aliases: | ||
- target_preresolve.example.com | ||
simulated_outside: | ||
environment: | ||
TARGET: target.example.com | ||
PRE_RESOLVE: 1 | ||
NAMESERVERS: "127.0.0.11" #use local docker nameserver | ||
HTTP_HEALTHCHECK: 1 | ||
HTTP_HEALTHCHECK_TIMEOUT_MS: 200 | ||
healthcheck: | ||
test: ["CMD", "healthcheck"] | ||
interval: 1s | ||
timeout: 1s | ||
retries: 0 | ||
start_period: 1s | ||
restart: unless-stopped | ||
|
||
proxy_without_preresolve: | ||
build: | ||
dockerfile: Dockerfile | ||
context: .. | ||
labels: | ||
- "AUTOHEAL_${COMPOSE_PROJECT_NAME}=true" | ||
depends_on: | ||
- target | ||
- autoheal | ||
networks: | ||
default: | ||
aliases: | ||
- target_without_preresolve.example.com | ||
simulated_outside: | ||
environment: | ||
TARGET: target.example.com | ||
# use no pre resolving (target gets resolved on every request) | ||
PRE_RESOLVE: 0 | ||
NAMESERVERS: "127.0.0.11" #use local docker nameserver | ||
HTTP_HEALTHCHECK: 1 | ||
HTTP_HEALTHCHECK_TIMEOUT_MS: 200 | ||
healthcheck: | ||
test: ["CMD", "healthcheck"] | ||
interval: 1s | ||
timeout: 1s | ||
retries: 0 | ||
start_period: 1s | ||
restart: unless-stopped | ||
|
||
proxy_smtp: | ||
build: | ||
dockerfile: Dockerfile | ||
context: .. | ||
labels: | ||
- "AUTOHEAL_${COMPOSE_PROJECT_NAME}=true" | ||
depends_on: | ||
- target_smtp | ||
- autoheal | ||
networks: | ||
default: | ||
aliases: | ||
- target_smtp.example.com | ||
simulated_outside: | ||
environment: | ||
TARGET: smtp.example.com | ||
PORT: 1025 | ||
PRE_RESOLVE: 1 | ||
NAMESERVERS: "127.0.0.11" #use local docker nameserver | ||
SMTP_HEALTHCHECK: 1 | ||
SMTP_HEALTHCHECK_URL: "smtp://$$TARGET:1025/" | ||
# mailhog doesn't support HELP command | ||
SMTP_HEALTHCHECK_COMMAND: "QUIT" | ||
SMTP_HEALTHCHECK_TIMEOUT_MS: 200 | ||
healthcheck: | ||
test: ["CMD", "healthcheck"] | ||
interval: 1s | ||
timeout: 1s | ||
retries: 0 | ||
start_period: 1s | ||
restart: unless-stopped | ||
|
||
target: | ||
image: nginx | ||
networks: | ||
simulated_outside: | ||
aliases: | ||
- target.example.com | ||
|
||
target_smtp: | ||
image: mailhog/mailhog | ||
networks: | ||
simulated_outside: | ||
aliases: | ||
- smtp.example.com | ||
|
||
target_firewalled_not_responding: | ||
image: python:3.9 | ||
volumes: | ||
- ./not_responding_tcp_port.py:/bin/not_responding_tcp_port | ||
command: ["not_responding_tcp_port", "0.0.0.0", "80"] | ||
networks: | ||
simulated_outside: | ||
aliases: | ||
- target.example.com | ||
|
||
target_smtp_firewalled_not_responding: | ||
image: python:3.9 | ||
volumes: | ||
- ./not_responding_tcp_port.py:/bin/not_responding_tcp_port | ||
command: ["not_responding_tcp_port", "0.0.0.0", "25"] | ||
networks: | ||
simulated_outside: | ||
aliases: | ||
- smtp.example.com | ||
|
||
networks: | ||
# we do not allow communication to the outside | ||
simulated_outside: | ||
internal: true | ||
default: | ||
internal: true |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#!/usr/bin/env python3 | ||
""" | ||
this is a server that accepts TCP connections but doesn't send any response. it just closes the connection after an hour | ||
has passed. this is intended for testing timeouts only. | ||
""" | ||
|
||
import socket | ||
import sys | ||
import time | ||
|
||
|
||
def keep_client_waiting(server_socket, wait_seconds=3600):
    """
    Accept a single client and keep its connection open without ever sending a response.

    :param server_socket: listening socket to accept the client on; it is closed before returning
    :param wait_seconds: how long the accepted connection is kept open, defaults to one hour
    :return: None
    """
    try:
        client, _address = server_socket.accept()
        try:
            print("connected")
            time.sleep(wait_seconds)
            if wait_seconds == 3600:
                print("waited for an hour")
            else:
                print("waited for %s seconds" % wait_seconds)
        finally:
            # close the connection explicitly instead of relying on garbage collection
            client.close()
    finally:
        # also release the listening socket when accepting or waiting was interrupted
        server_socket.close()
|
||
|
||
def start_server():
    """
    Listen on the address and port given on the command line and keep the first client waiting.

    The first argument is the listen address (default: empty string), the second one the listen port
    (default: 80).
    """
    cli_args = sys.argv[1:]
    listen_address = cli_args[0] if cli_args else ""
    listen_port = int(cli_args[1]) if len(cli_args) >= 2 else 80
    listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    listener.bind((listen_address, listen_port))
    listener.listen()
    keep_client_waiting(listener)


if __name__ == "__main__":
    start_server()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,215 @@ | ||
import hashlib | ||
import json | ||
import logging | ||
import os.path | ||
from datetime import datetime | ||
from time import sleep | ||
|
||
import plumbum.commands.processes | ||
import pytest | ||
from plumbum import TF, local | ||
from plumbum.cmd import docker, docker_compose, which | ||
|
||
# compose file used by all tests; the relative path is resolved against the working directory at import time
HEALTHCHECK_YAML = os.path.abspath("tests/healthcheck.yaml")

# (proxy service, target service) combinations the parametrized tests run with
PROXY_TARGET_PAIRS = [
    ("proxy_preresolve", "target"),
    ("proxy_smtp", "target_smtp"),
    ("proxy_without_preresolve", "target"),
]

logger = logging.getLogger()

# docker-compose command bound to the healthcheck compose file
_healthcheck = docker_compose["-f", HEALTHCHECK_YAML]
# "ps -q" on top of that command, called with a service name to get its container id
_get_container_id = _healthcheck["ps", "-q"]
|
||
|
||
def _get_container_id_and_ip(service_name):
    """
    Look up the container of a compose service.

    :param service_name: name of the service in the compose file
    :return: tuple of container id and the ip address of the container in the simulated_outside network
    """
    container_id = _get_container_id(service_name).strip()
    inspected = json.loads(docker("inspect", container_id))
    networks = inspected[0]["NetworkSettings"]["Networks"]
    # compose prefixes the network name with the project name
    network_name = "%s_simulated_outside" % local.env["COMPOSE_PROJECT_NAME"]
    return container_id, networks[network_name]["IPAddress"]
|
||
|
||
def _new_ip(target):
    """
    Force the target service onto a new ip by scaling it to two containers and removing the old one.

    :param target: name of the target service in the compose file
    """
    # remember the currently running container to be able to remove it after scaling up
    previous_id, previous_ip = _get_container_id_and_ip(target)

    # start a second instance of the target
    scale = "%s=2" % target
    _healthcheck("up", "-d", "--scale", scale, target)

    # stop and remove the old container
    for docker_action in ("stop", "rm"):
        docker(docker_action, previous_id)

    # verify that we got a new ip (should not be able to reuse the old one)
    current_id, current_ip = _get_container_id_and_ip(target)
    assert previous_id != current_id
    assert previous_ip != current_ip
|
||
|
||
def _wait_for(proxy, message, callback, *args):
    """
    Call ``callback(*args)`` once per second until its result contains ``message``.

    :param proxy: proxy service whose healthcheck output is logged if waiting fails
    :param message: text to wait for
    :param callback: callable returning the text to search in
    :param args: arguments passed to ``callback``
    """
    try:
        while True:
            if message in callback(*args):
                break
            # try again in one second (to not hammer the CPU)
            sleep(1)
    except Exception:
        # add additional infos to any error to make tracing down the error easier
        logger.error("failed waiting for '%s'" % message)
        for diagnostic_args in (("logs", "autoheal"), ("ps",)):
            logger.error(_healthcheck(*diagnostic_args))
        logger.error(_healthcheck("exec", "-T", proxy, "healthcheck", retcode=None))
        raise
|
||
|
||
def _sha256(text):
    """Return the hex encoded SHA-256 digest of ``str(text)`` encoded as utf-8."""
    digest = hashlib.sha256()
    digest.update(str(text).encode("utf-8"))
    return digest.hexdigest()
|
||
|
||
@pytest.fixture(scope="session")
def os_needs_privileges():
    """
    Tell whether autoheal has to be started with privileges.

    :return: the string "true" if ``getenforce`` can be found on this system, otherwise "false"
    """
    # if we can find getenforce on the current system, SELinux is probably installed and we need to start
    # autoheal with privileges
    selinux_probably_installed = which["getenforce"] & TF
    return "true" if selinux_probably_installed else "false"
|
||
|
||
@pytest.fixture(scope="function", autouse=True)
def _cleanup_docker_compose(tmp_path, os_needs_privileges):
    """
    Run every test in its own compose project and tear that project down afterwards.

    The project name is built from the name of the test's temp directory and the first six characters of the
    sha256 of its path.
    """
    with local.cwd(tmp_path):
        custom_compose_project_name = "_".join(
            (os.path.basename(tmp_path), _sha256(tmp_path)[:6])
        )
        compose_environment = local.env(
            COMPOSE_PROJECT_NAME=custom_compose_project_name,
            OS_NEEDS_PRIVILEGES_FOR_DOCKER_SOCK=os_needs_privileges,
        )
        with compose_environment as env:
            yield env

            # stop autoheal first to prevent it from restarting containers to be stopped
            for teardown_args in (("stop", "autoheal"), ("down", "-v")):
                _healthcheck(*teardown_args)
|
||
|
||
@pytest.mark.parametrize("proxy,target", PROXY_TARGET_PAIRS)
def test_healthcheck_ok(proxy, target):
    """The healthcheck of a proxy succeeds while its target is up."""
    # given: a started proxy with healthcheck
    _healthcheck("up", "-d", proxy)

    # when: everything is ok and target is Up
    target_status = _healthcheck("ps", target)
    assert "Up" in target_status

    # then: running the healthcheck must not fail
    _healthcheck("exec", "-T", proxy, "healthcheck")
|
||
|
||
@pytest.mark.parametrize("proxy,target", PROXY_TARGET_PAIRS)
def test_healthcheck_failing(proxy, target):
    """The healthcheck of a proxy fails once its target is stopped."""
    # given: a started proxy with healthcheck, autoheal stopped so it cannot interfere
    # when: the target is stopped and therefore not reachable
    for compose_args in (("up", "-d", proxy), ("stop", "autoheal"), ("stop", target)):
        _healthcheck(*compose_args)
    target_status = _healthcheck("ps", target)
    assert " Exit " in target_status

    # then: healthcheck should return an error (non zero exit code)
    expected_failure = pytest.raises(
        plumbum.commands.processes.ProcessExecutionError,
        match=r"Unexpected exit code: (1|137)",
    )
    with expected_failure:
        _healthcheck("exec", "-T", proxy, "healthcheck")
|
||
|
||
@pytest.mark.parametrize("proxy,target", PROXY_TARGET_PAIRS)
@pytest.mark.timeout(30)
def test_healthcheck_failing_firewalled(proxy, target):
    """The healthcheck fails within its timeout if the target accepts connections but never answers."""
    # given: a started proxy with healthcheck, autoheal stopped so it cannot interfere
    _healthcheck("up", "-d", proxy)
    _healthcheck("stop", "autoheal")

    # when: the target is replaced by a service that stops responding
    not_responding_target = "{target:s}_firewalled_not_responding".format(
        target=target
    )
    _healthcheck("stop", target)
    assert " Exit " in _healthcheck("ps", target)
    _healthcheck("up", "-d", not_responding_target)
    assert "Up" in _healthcheck("ps", not_responding_target)

    # then: healthcheck should return an error (non zero exit code)
    expected_failure = pytest.raises(
        plumbum.commands.processes.ProcessExecutionError,
        match=r"Unexpected exit code: (1|137)",
    )
    with expected_failure:
        start = datetime.now()
        _healthcheck("exec", "-T", proxy, "healthcheck")
    elapsed_seconds = (datetime.now() - start).total_seconds()
    # timeout is set to 200ms for tests, so the exception should be raised at earliest after 0.2s
    # and at most 2s after starting considering overhead
    # if it happens outside that timeframe (especially before 0.2s) the exception might hint to another error type
    assert 0.2 < elapsed_seconds < 2
|
||
|
||
@pytest.mark.parametrize(
    "proxy,target",
    [pair for pair in PROXY_TARGET_PAIRS if pair[0] != "proxy_without_preresolve"],
)
@pytest.mark.timeout(60)
def test_healthcheck_autoheal(proxy, target):
    """Autoheal restarts a proxy after its target got a new ip and the proxy gets healthy again."""
    # given: a started proxy with healthcheck that was healthy
    _healthcheck("up", "-d", proxy)
    proxy_container_id = _get_container_id(proxy).strip()
    _wait_for(proxy, "Up (healthy)", _healthcheck, "ps", proxy)

    # when: target gets a new ip
    _new_ip(target)

    # then: autoheal should restart the proxy
    restart_message = (
        "(%s) found to be unhealthy - Restarting container now"
        % proxy_container_id[:12]
    )
    _wait_for(proxy, restart_message, _healthcheck, "logs", "autoheal")

    # and the proxy should become healthy
    _wait_for(proxy, "Up (healthy)", _healthcheck, "ps", proxy)

    # and healthcheck should be successful
    _healthcheck("exec", "-T", proxy, "healthcheck")
|
||
|
||
def test_healthcheck_autoheal_proxy_without_preresolve():
    """A proxy without pre resolving stays healthy and is not restarted when the target gets a new ip."""
    # given: a started proxy with healthcheck that was healthy
    proxy = "proxy_without_preresolve"
    _healthcheck("up", "-d", proxy)
    _wait_for(proxy, "Up (healthy)", _healthcheck, "ps", proxy)

    # when: target gets a new ip
    _new_ip("target")

    # then: healthcheck should be always successful (we wait just for 5 seconds/healthchecks)
    for _ in range(5):
        _healthcheck("exec", "-T", proxy, "healthcheck")
        sleep(1)

    # and autoheal shouldn't have restarted anything
    autoheal_log_lines = _healthcheck("logs", "autoheal").split("\n")
    unexpected_lines = [
        line
        for line in autoheal_log_lines
        if line and not line.startswith("Attaching to ")
    ]
    assert not unexpected_lines