Skip to content

Commit

Permalink
Merge pull request #8 from ap-wtioit/master-enable_healthcheck_github
Browse files Browse the repository at this point in the history
add healthcheck support for http and smtp
pedrobaeza authored Feb 7, 2024
2 parents d2cf614 + 4627f26 commit 118a9ac
Showing 8 changed files with 972 additions and 308 deletions.
14 changes: 11 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,18 +1,26 @@
FROM python:3-alpine
ENTRYPOINT ["dumb-init", "--"]
CMD ["proxy"]
RUN apk add --no-cache -t .build build-base &&\
HEALTHCHECK CMD ["healthcheck"]
RUN apk add --no-cache -t .build build-base curl-dev &&\
apk add --no-cache socat &&\
pip install --no-cache-dir dnspython dumb-init &&\
apk add --no-cache libcurl &&\
pip install --no-cache-dir dnspython dumb-init pycurl &&\
apk del .build
ENV NAMESERVERS="208.67.222.222 8.8.8.8 208.67.220.220 8.8.4.4" \
PORT="80 443" \
PRE_RESOLVE=0 \
MODE=tcp \
VERBOSE=0 \
MAX_CONNECTIONS=100 \
UDP_ANSWERS=1
UDP_ANSWERS=1 \
HTTP_HEALTHCHECK=0\
HTTP_HEALTHCHECK_URL="http://\$TARGET/"\
SMTP_HEALTHCHECK=0\
SMTP_HEALTHCHECK_URL="smtp://\$TARGET/"\
SMTP_HEALTHCHECK_COMMAND="HELP"
COPY proxy.py /usr/local/bin/proxy
COPY healthcheck.py /usr/local/bin/healthcheck

# Labels
ARG BUILD_DATE
63 changes: 63 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -35,6 +35,36 @@ Use these environment variables:

Required. It's the host name where the incoming connections will be redirected to.

### `HTTP_HEALTHCHECK`

Default: `0`

Set to `1` to enable healthcheck with pycurl http requests. This is useful if the target
uses a deployment where the IP of the service changes frequently (e.g.
`accounts.google.com`) and you are using [`PRE_RESOLVE`](#pre_resolve).

#### Automatically restarting unhealthy proxies

When you enable the http healthcheck, a failing check only marks the container as
unhealthy; nothing restarts it automatically. (see https://github.com/moby/moby/pull/22719)

If you want to restart your proxies automatically, you can use
https://github.com/willfarrell/docker-autoheal.

### `HTTP_HEALTHCHECK_URL`

Default: `http://$TARGET/`

Url to use in [`HTTP_HEALTHCHECK`](#http_healthcheck) if enabled. `$TARGET` gets
replaced inside the url by the configured [`TARGET`](#target).

### `HTTP_HEALTHCHECK_TIMEOUT_MS`

Default: `2000`

Timeout in milliseconds for http healthcheck. This is used as a timeout for connecting
and receiving an answer. You may end up with twice the time spent.

### `MODE`

Default: `tcp`
@@ -94,6 +124,39 @@ Set to `1` to force using the specified [nameservers](#nameservers) to resolve t

This is especially useful when using a network alias to whitelist an external API.

### `SMTP_HEALTHCHECK`

Default: `0`

Set to `1` to enable healthcheck with pycurl smtp requests. This is useful if the target
uses a deployment where the IP of the service changes frequently (e.g.
`smtp.eu.sparkpostmail.com`) and you are using [`PRE_RESOLVE`](#pre_resolve).

#### Automatically restarting unhealthy proxies

see [HTTP_HEALTHCHECK](#http_healthcheck)

### `SMTP_HEALTHCHECK_URL`

Default: `smtp://$TARGET/`

Url to use in [`SMTP_HEALTHCHECK`](#smtp_healthcheck) if enabled. `$TARGET` gets
replaced inside the url by the configured [`TARGET`](#target).

### `SMTP_HEALTHCHECK_COMMAND`

Default: `HELP`

Enables changing the healthcheck command for servers that do not support `HELP` (e.g.
for [MailHog](https://github.com/mailhog/MailHog) you can use `QUIT`)

### `SMTP_HEALTHCHECK_TIMEOUT_MS`

Default: `2000`

Timeout in milliseconds for smtp healthcheck. This is used as a timeout for connecting
and receiving an answer. You may end up with twice the time spent.

### `UDP_ANSWERS`

Default: `1`
184 changes: 184 additions & 0 deletions healthcheck.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
#!/usr/bin/env python3

import logging
import os

# configure the root logger for INFO output and create the named logger used by every check in this script
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("healthcheck")


def error(message, exception=None):
    """
    Log a healthcheck failure and abort the running check.

    :param message: text that is logged at ERROR level
    :param exception: optional exception that is raised after logging; when omitted the process exits with status 1
    :return: never returns normally
    """
    # function-scope import, like the other helper imports in this script
    import sys

    logger.error(message)
    if exception is None:
        # sys.exit() instead of the builtin exit(): the builtin is only injected by the site module for interactive
        # use and is not guaranteed to exist (e.g. when python runs with -S)
        sys.exit(1)
    raise exception


def http_healthcheck():
    """
    Use pycurl to check if the target server is still responding via proxy.py

    The url is taken from ``HTTP_HEALTHCHECK_URL`` (``$TARGET`` is replaced by the ``TARGET`` environment variable)
    and the timeout from ``HTTP_HEALTHCHECK_TIMEOUT_MS``. The request is pinned to 127.0.0.1 for ``TARGET`` so it
    passes through the local proxy process instead of going to the target directly.

    :return: None
    :raises pycurl.error: when the request fails or times out (after the failure has been logged)
    :raises ValueError: when the url contains a port that is not a number
    """
    from urllib.parse import urlsplit

    import pycurl

    check_url = os.environ.get("HTTP_HEALTHCHECK_URL", "http://localhost/")
    check_timeout_ms = int(os.environ.get("HTTP_HEALTHCHECK_TIMEOUT_MS", 2000))
    target = os.environ.get("TARGET", "localhost")
    check_url_with_target = check_url.replace("$TARGET", target)
    parsed_url = urlsplit(check_url_with_target)
    # the port of the RESOLVE entry has to be the port curl connects to, otherwise the entry is not used and the
    # request bypasses the proxy; without an explicit port that is 443 for https and 80 for http
    port = parsed_url.port or (443 if parsed_url.scheme == "https" else 80)
    logger.info("checking %s via 127.0.0.1" % check_url_with_target)
    request = pycurl.Curl()
    try:
        request.setopt(pycurl.URL, check_url_with_target)
        # do not send the request to the target directly but use our own socat proxy process to check if it's still
        # working
        # NOTE(review): the entry is keyed on TARGET, so a check url whose host differs from TARGET is presumably
        # not pinned to the proxy - confirm whether that is intended
        request.setopt(pycurl.RESOLVE, ["{}:{}:127.0.0.1".format(target, port)])
        request.setopt(pycurl.CONNECTTIMEOUT_MS, check_timeout_ms)
        request.setopt(pycurl.TIMEOUT_MS, check_timeout_ms)
        request.perform()
    except pycurl.error as e:
        error("error while checking http connection", e)
    finally:
        # release the curl handle on the failure path as well
        request.close()


def smtp_healthcheck():
    """
    Use pycurl to check if the target server is still responding via proxy.py

    The url is taken from ``SMTP_HEALTHCHECK_URL`` (``$TARGET`` is replaced by the ``TARGET`` environment variable),
    the command that is sent from ``SMTP_HEALTHCHECK_COMMAND`` and the timeout from ``SMTP_HEALTHCHECK_TIMEOUT_MS``.
    The request is pinned to 127.0.0.1 for ``TARGET`` so it passes through the local proxy process instead of going
    to the target directly.

    :return: None
    :raises pycurl.error: when the request fails or times out (after the failure has been logged)
    :raises ValueError: when the url contains a port that is not a number
    """
    from urllib.parse import urlsplit

    import pycurl

    check_url = os.environ.get("SMTP_HEALTHCHECK_URL", "smtp://localhost/")
    check_command = os.environ.get("SMTP_HEALTHCHECK_COMMAND", "HELP")
    check_timeout_ms = int(os.environ.get("SMTP_HEALTHCHECK_TIMEOUT_MS", 2000))
    target = os.environ.get("TARGET", "localhost")
    check_url_with_target = check_url.replace("$TARGET", target)
    parsed_url = urlsplit(check_url_with_target)
    # the port of the RESOLVE entry has to be the port curl connects to, otherwise the entry is not used and the
    # request bypasses the proxy; without an explicit port that is 465 for smtps and 25 for smtp
    port = parsed_url.port or (465 if parsed_url.scheme == "smtps" else 25)
    logger.info("checking %s via 127.0.0.1" % check_url_with_target)
    request = pycurl.Curl()
    try:
        request.setopt(pycurl.URL, check_url_with_target)
        request.setopt(pycurl.CUSTOMREQUEST, check_command)
        # do not send the request to the target directly but use our own socat proxy process to check if it's still
        # working
        # NOTE(review): the entry is keyed on TARGET, so a check url whose host differs from TARGET is presumably
        # not pinned to the proxy - confirm whether that is intended
        request.setopt(pycurl.RESOLVE, ["{}:{}:127.0.0.1".format(target, port)])
        request.setopt(pycurl.CONNECTTIMEOUT_MS, check_timeout_ms)
        request.setopt(pycurl.TIMEOUT_MS, check_timeout_ms)
        request.perform()
    except pycurl.error as e:
        error("error while checking smtp connection", e)
    finally:
        # release the curl handle on the failure path as well
        request.close()


def process_healthcheck():
    """
    Check that at least one socat process exists per port and no more than the number of configured max connections
    processes exist for each port.

    The ports are read from the ``PORT`` environment variable (whitespace separated) and the limit from
    ``MAX_CONNECTIONS``. Processes are found by scanning ``/proc/<pid>/cmdline`` for ``socat``; the port of a process
    is the text after the last ``:`` of its last argument.

    :return: None; a failed check is reported through ``error`` which ends the process
    """
    import glob

    ports = os.environ["PORT"].split()
    max_connections = int(os.environ["MAX_CONNECTIONS"])
    logger.info(
        "checking socat processes for port(s) %s having at least one and less than %d socat processes"
        % (ports, max_connections)
    )
    # read the command lines directly instead of spawning `sh -c grep ...`: the helper shell had "socat" in its own
    # command line, and grep exits non-zero (raising CalledProcessError) when nothing matches
    socat_commands = []
    for cmdline_path in glob.glob("/proc/[0-9]*/cmdline"):
        try:
            # binary mode: a command line is not guaranteed to be valid utf-8
            with open(cmdline_path, "rb") as fp:
                raw_cmdline = fp.read()
        except (FileNotFoundError, ProcessLookupError):
            # ignore processes no longer existing (possibly retrieved an answer)
            continue
        if b"socat" in raw_cmdline:
            # arguments in /proc/.../cmdline are split by null bytes
            socat_commands.append(
                [part for part in raw_cmdline.split(b"\x00") if part]
            )
    if len(socat_commands) < len(ports):
        # if we have less than the number of ports socat processes we do not need to count processes per port and can
        # fail fast
        error("Expected at least %d socat processes" % len(ports))
    port_process_count = {port: 0 for port in ports}
    for cmd in socat_commands:
        # foreach socat process we detect the port it's for by checking the last argument (connect to) that ends with
        # :{ip}:{port} for our processes
        port = cmd[-1].rpartition(b":")[2].decode("utf-8", "replace")
        # processes that do not belong to one of the configured ports are not ours and are not counted
        if port in port_process_count:
            port_process_count[port] += 1
    for port in ports:
        if port_process_count[port] == 0:
            error("Missing socat process(es) for port: %s" % port)
        # NOTE(review): this already fails at exactly max_connections + 1 processes although the message says
        # "More than"; confirm which of the two is intended before changing either
        if port_process_count[port] >= max_connections + 1:
            error(
                "More than %d + 1 socat process(es) for port: %s"
                % (max_connections, port)
            )


def preresolve_healthcheck():
    """
    Check that the pre-resolved ip is still valid now for target

    The ips in use are taken from the ``udp-connect:``/``tcp-connect:`` entries grep finds in ``/proc/<pid>/cmdline``
    (third ``:``-separated field of every output line) and compared with what the nameservers from ``NAMESERVERS``
    answer for ``TARGET``. Once the answers are seen changing between consecutive lookups a flag file is written to
    the temp directory and later runs skip this check.

    :return: None; a stale ip is reported through ``error`` which ends the process
    """
    from tempfile import gettempdir

    load_balancing_dns_fs_flag = os.path.join(
        gettempdir(), "load_balancing_dns_detected"
    )
    if os.path.exists(load_balancing_dns_fs_flag):
        # a previous run flagged the target as being dns load-balanced, nothing to check any more
        return

    import subprocess

    from dns.resolver import Resolver

    grep_output = subprocess.check_output(
        ["sh", "-c", "grep -R '\\(udp\\|tcp\\)-connect:' /proc/[0-9]*/cmdline"]
    ).decode("utf-8")
    pre_resolved_ips = set()
    for line in grep_output.split("\n"):
        if line:
            pre_resolved_ips.add(line.split(":")[2])

    resolver = Resolver()
    resolver.nameservers = os.environ["NAMESERVERS"].split()
    target = os.environ["TARGET"]

    def lookup():
        # one fresh dns query for the target, answers in the order the resolver returned them
        return [answer.address for answer in resolver.resolve(target)]

    resolved_ips = lookup()
    for ip in pre_resolved_ips:
        logger.info(f"checking {target} resolves to {ip}")
        if ip in resolved_ips:
            continue
        resolved_ips_2 = lookup()
        if resolved_ips_2 == resolved_ips:
            # two identical answers without our ip: the pre-resolved ip is stale
            error(
                f"{target} no longer resolves to {ip}, {resolved_ips}, {resolved_ips_2}"
            )
        else:
            # to make sure we didn't just hit the server switch in dns, we check again before deactivating
            # the healthcheck permanently (until the container restarts)
            resolved_ips_3 = lookup()
            if resolved_ips_3 != resolved_ips_2:
                logger.info(
                    f"{target} seems to be load-balancing with dns ({resolved_ips} != {resolved_ips_2}), "
                    f"deactivating the resolver healthcheck"
                )
                with open(load_balancing_dns_fs_flag, "w") as fp:
                    fp.write(target)


# run the checks one after another: the socat process check always runs, the others only when their switch is "1";
# a failing check ends the script through error() before the remaining checks run
process_healthcheck()
# read PRE_RESOLVE like the other two switches so an unset variable means "disabled" instead of raising KeyError
if os.environ.get("PRE_RESOLVE", "0") == "1":
    preresolve_healthcheck()
if os.environ.get("HTTP_HEALTHCHECK", "0") == "1":
    http_healthcheck()
if os.environ.get("SMTP_HEALTHCHECK", "0") == "1":
    smtp_healthcheck()
631 changes: 326 additions & 305 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -14,6 +14,7 @@ black = {version = "^20.8b1", allow-prereleases = true}
flake8 = "^3.8.4"
plumbum = "^1.6.9"
pytest-xdist = "^2.1.0"
pytest-timeout = "^2.2.0"

[build-system]
requires = ["poetry-core>=1.0.0"]
141 changes: 141 additions & 0 deletions tests/healthcheck.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
version: "3.8"
services:
autoheal:
image: willfarrell/autoheal
restart: unless-stopped
environment:
AUTOHEAL_INTERVAL: 1
AUTOHEAL_CONTAINER_LABEL: "AUTOHEAL_${COMPOSE_PROJECT_NAME}"
privileged: "${OS_NEEDS_PRIVILEGES_FOR_DOCKER_SOCK:-false}"
volumes:
- /var/run/docker.sock:/var/run/docker.sock

proxy_preresolve:
build:
dockerfile: Dockerfile
context: ..
labels:
- "AUTOHEAL_${COMPOSE_PROJECT_NAME}=true"
depends_on:
- target
- autoheal
networks:
default:
aliases:
- target_preresolve.example.com
simulated_outside:
environment:
TARGET: target.example.com
PRE_RESOLVE: 1
NAMESERVERS: "127.0.0.11" #use local docker nameserver
HTTP_HEALTHCHECK: 1
HTTP_HEALTHCHECK_TIMEOUT_MS: 200
healthcheck:
test: ["CMD", "healthcheck"]
interval: 1s
timeout: 1s
retries: 0
start_period: 1s
restart: unless-stopped

proxy_without_preresolve:
build:
dockerfile: Dockerfile
context: ..
labels:
- "AUTOHEAL_${COMPOSE_PROJECT_NAME}=true"
depends_on:
- target
- autoheal
networks:
default:
aliases:
- target_without_preresolve.example.com
simulated_outside:
environment:
TARGET: target.example.com
# use no pre resolving (target gets resolved on every request)
PRE_RESOLVE: 0
NAMESERVERS: "127.0.0.11" #use local docker nameserver
HTTP_HEALTHCHECK: 1
HTTP_HEALTHCHECK_TIMEOUT_MS: 200
healthcheck:
test: ["CMD", "healthcheck"]
interval: 1s
timeout: 1s
retries: 0
start_period: 1s
restart: unless-stopped

proxy_smtp:
build:
dockerfile: Dockerfile
context: ..
labels:
- "AUTOHEAL_${COMPOSE_PROJECT_NAME}=true"
depends_on:
- target_smtp
- autoheal
networks:
default:
aliases:
- target_smtp.example.com
simulated_outside:
environment:
TARGET: smtp.example.com
PORT: 1025
PRE_RESOLVE: 1
NAMESERVERS: "127.0.0.11" #use local docker nameserver
SMTP_HEALTHCHECK: 1
SMTP_HEALTHCHECK_URL: "smtp://$$TARGET:1025/"
# mailhog doesn't support HELP command
SMTP_HEALTHCHECK_COMMAND: "QUIT"
SMTP_HEALTHCHECK_TIMEOUT_MS: 200
healthcheck:
test: ["CMD", "healthcheck"]
interval: 1s
timeout: 1s
retries: 0
start_period: 1s
restart: unless-stopped

target:
image: nginx
networks:
simulated_outside:
aliases:
- target.example.com

target_smtp:
image: mailhog/mailhog
networks:
simulated_outside:
aliases:
- smtp.example.com

target_firewalled_not_responding:
image: python:3.9
volumes:
- ./not_responding_tcp_port.py:/bin/not_responding_tcp_port
command: ["not_responding_tcp_port", "0.0.0.0", "80"]
networks:
simulated_outside:
aliases:
- target.example.com

target_smtp_firewalled_not_responding:
image: python:3.9
volumes:
- ./not_responding_tcp_port.py:/bin/not_responding_tcp_port
command: ["not_responding_tcp_port", "0.0.0.0", "25"]
networks:
simulated_outside:
aliases:
- smtp.example.com

networks:
# we do not allow communication to the outside
simulated_outside:
internal: true
default:
internal: true
31 changes: 31 additions & 0 deletions tests/not_responding_tcp_port.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env python3
"""
this is a server that accepts TCP connections but doesn't send any response. it just closes the connection after an hour
has passed. this is intended for testing timeouts only.
"""

import socket
import sys
import time


def keep_client_waiting(server_socket):
    """
    Accept one connection on *server_socket* and keep it open for an hour without sending anything.

    Afterwards both the accepted connection and the listening socket are closed.

    :param server_socket: a bound and listening socket
    :return: None
    """
    client, _address = server_socket.accept()
    try:
        # flush so the message is visible right away even when stdout is not a terminal
        print("connected", flush=True)
        time.sleep(3600)
        print("waited for an hour", flush=True)
    finally:
        # close the accepted connection as well, not only the listening socket
        client.close()
        server_socket.close()


def start_server():
    """
    Listen on the address and port given on the command line and keep the first client waiting.

    ``argv[1]`` is the listen address (default ``""``) and ``argv[2]`` the port (default 80).
    """
    arguments = sys.argv[1:]
    listen_address = arguments[0] if arguments else ""
    listen_port = int(arguments[1]) if len(arguments) > 1 else 80
    server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server_socket.bind((listen_address, listen_port))
    server_socket.listen()
    keep_client_waiting(server_socket)


if __name__ == "__main__":
    start_server()
215 changes: 215 additions & 0 deletions tests/test_healtcheck.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
import hashlib
import json
import logging
import os.path
from datetime import datetime
from time import sleep

import plumbum.commands.processes
import pytest
from plumbum import TF, local
from plumbum.cmd import docker, docker_compose, which

# compose file with the proxies, their targets and the autoheal service used by these tests
HEALTHCHECK_YAML = os.path.abspath("tests/healthcheck.yaml")

# (proxy service, target service) pairs the parametrized tests run against
PROXY_TARGET_PAIRS = [
    ("proxy_preresolve", "target"),
    ("proxy_smtp", "target_smtp"),
    ("proxy_without_preresolve", "target"),
]

logger = logging.getLogger()

# docker-compose command bound to the healthcheck compose file
_healthcheck = docker_compose["-f", HEALTHCHECK_YAML]
# `ps -q` on that compose file; called with a service name to get its container id
_get_container_id = _healthcheck["ps", "-q"]


def _get_container_id_and_ip(service_name):
    """Return ``(container_id, ip)`` of *service_name*; the ip is the one on the simulated_outside network."""
    container_id = _get_container_id(service_name).strip()
    inspected = json.loads(docker("inspect", container_id))
    networks = inspected[0]["NetworkSettings"]["Networks"]
    # compose prefixes network names with the project name
    outside_network = "%s_simulated_outside" % local.env["COMPOSE_PROJECT_NAME"]
    return container_id, networks[outside_network]["IPAddress"]


def _new_ip(target):
    """Move *target* to a new ip by scaling it to two containers and removing the one that ran before."""
    # remember the currently running container so it can be removed once the second instance is up
    previous_id, previous_ip = _get_container_id_and_ip(target)

    # start a second instance of the target
    _healthcheck("up", "-d", "--scale", "%s=2" % target, target)

    # stop and remove the old container
    for docker_action in ("stop", "rm"):
        docker(docker_action, previous_id)

    # verify that we got a new ip (should not be able to reuse the old one)
    current_id, current_ip = _get_container_id_and_ip(target)
    assert previous_id != current_id
    assert previous_ip != current_ip


def _wait_for(proxy, message, callback, *args):
    """
    Poll ``callback(*args)`` once per second until its output contains *message*.

    When the callback raises, the autoheal logs, the compose status and the healthcheck output of *proxy* are logged
    before the exception is re-raised.
    """
    try:
        while True:
            if message in callback(*args):
                break
            # try again in one second (to not hammer the CPU)
            sleep(1)
    except Exception:
        # add additional infos to any error to make tracing down the error easier
        logger.error("failed waiting for '%s'", message)
        diagnostics = (
            lambda: _healthcheck("logs", "autoheal"),
            lambda: _healthcheck("ps"),
            lambda: _healthcheck("exec", "-T", proxy, "healthcheck", retcode=None),
        )
        for collect in diagnostics:
            logger.error(collect())
        raise


def _sha256(text):
    """Return the hexadecimal sha256 digest of ``str(text)`` encoded as utf-8."""
    digest = hashlib.sha256()
    digest.update(str(text).encode("utf-8"))
    return digest.hexdigest()


@pytest.fixture(scope="session")
def os_needs_privileges():
    """Return ``"true"`` when ``getenforce`` can be found on this system, ``"false"`` otherwise."""
    # if we can find getenforce on the current system, SELinux is probably installed and we need to start
    # autoheal with privileges
    selinux_tools_found = which["getenforce"] & TF
    return "true" if selinux_tools_found else "false"


@pytest.fixture(scope="function", autouse=True)
def _cleanup_docker_compose(tmp_path, os_needs_privileges):
    """
    Run every test in its own temp directory and compose project and tear the project down afterwards.

    The project name is built from the temp directory name and the first six characters of the sha256 of its path.
    """
    project_name = "{}_{}".format(os.path.basename(tmp_path), _sha256(tmp_path)[:6])
    compose_env = local.env(
        COMPOSE_PROJECT_NAME=project_name,
        OS_NEEDS_PRIVILEGES_FOR_DOCKER_SOCK=os_needs_privileges,
    )
    with local.cwd(tmp_path), compose_env as env:
        yield env

        # stop autoheal first to prevent it from restarting containers to be stopped
        _healthcheck("stop", "autoheal")
        _healthcheck("down", "-v")


@pytest.mark.parametrize("proxy,target", PROXY_TARGET_PAIRS)
def test_healthcheck_ok(proxy, target):
    """The healthcheck succeeds while the target of the proxy is up."""
    # given a started proxy with healthcheck
    _healthcheck("up", "-d", proxy)

    # when everything is ok and target is Up
    target_status = _healthcheck("ps", target)
    assert "Up" in target_status

    # then healthcheck should be successful
    _healthcheck("exec", "-T", proxy, "healthcheck")


@pytest.mark.parametrize("proxy,target", PROXY_TARGET_PAIRS)
def test_healthcheck_failing(proxy, target):
    """The healthcheck exits non zero once the target of the proxy is stopped."""
    # given a started proxy with healthcheck
    _healthcheck("up", "-d", proxy)
    # and autoheal not interfering
    _healthcheck("stop", "autoheal")

    # when target is not reachable
    _healthcheck("stop", target)
    target_status = _healthcheck("ps", target)
    assert " Exit " in target_status

    # then healthcheck should return an error (non zero exit code)
    expected_failure = pytest.raises(
        plumbum.commands.processes.ProcessExecutionError,
        match=r"Unexpected exit code: (1|137)",
    )
    with expected_failure:
        _healthcheck("exec", "-T", proxy, "healthcheck")


@pytest.mark.parametrize("proxy,target", PROXY_TARGET_PAIRS)
@pytest.mark.timeout(30)
def test_healthcheck_failing_firewalled(proxy, target):
    """The healthcheck fails within its timeout when the target accepts connections but never answers."""
    silent_target = "{target:s}_firewalled_not_responding".format(target=target)

    # given a started proxy with healthcheck
    _healthcheck("up", "-d", proxy)
    # and autoheal not interfering
    _healthcheck("stop", "autoheal")

    # when target stops responding
    _healthcheck("stop", target)
    assert " Exit " in _healthcheck("ps", target)
    _healthcheck("up", "-d", silent_target)
    assert "Up" in _healthcheck("ps", silent_target)

    # then healthcheck should return an error (non zero exit code)
    expected_failure = pytest.raises(
        plumbum.commands.processes.ProcessExecutionError,
        match=r"Unexpected exit code: (1|137)",
    )
    with expected_failure:
        start = datetime.now()
        _healthcheck("exec", "-T", proxy, "healthcheck")
    end = datetime.now()
    # timeout is set to 200ms for tests, so the exception should be raised at earliest after 0.2s
    # and at most 2s after starting considering overhead
    # if it happens outside that timeframe (especially before 0.2s) the exception might hint to another error type
    elapsed_seconds = (end - start).total_seconds()
    assert 0.2 < elapsed_seconds < 2


@pytest.mark.parametrize(
    "proxy,target",
    [pair for pair in PROXY_TARGET_PAIRS if pair[0] != "proxy_without_preresolve"],
)
@pytest.mark.timeout(60)
def test_healthcheck_autoheal(proxy, target):
    """Autoheal restarts a pre-resolving proxy after its target moved to a new ip and the proxy recovers."""
    # given a started proxy with healthcheck
    _healthcheck("up", "-d", proxy)
    proxy_container_id = _get_container_id(proxy).strip()
    # that was healthy
    _wait_for(proxy, "Up (healthy)", _healthcheck, "ps", proxy)

    # when target gets a new ip
    _new_ip(target)

    # then autoheal should restart the proxy
    restart_message = (
        "(%s) found to be unhealthy - Restarting container now"
        % proxy_container_id[:12]
    )
    _wait_for(proxy, restart_message, _healthcheck, "logs", "autoheal")

    # and the proxy should become healthy
    _wait_for(proxy, "Up (healthy)", _healthcheck, "ps", proxy)

    # and healthcheck should be successful
    _healthcheck("exec", "-T", proxy, "healthcheck")


def test_healthcheck_autoheal_proxy_without_preresolve():
    """A proxy that does not pre-resolve stays healthy when its target gets a new ip and is not restarted."""
    # given a started proxy with healthcheck
    proxy = "proxy_without_preresolve"
    _healthcheck("up", "-d", proxy)
    # that was healthy
    _wait_for(proxy, "Up (healthy)", _healthcheck, "ps", proxy)

    # when target gets a new ip
    _new_ip("target")

    # then healthcheck should be always successful (we wait just for 5 seconds/healthchecks)
    for _ in range(5):
        _healthcheck("exec", "-T", proxy, "healthcheck")
        sleep(1)

    # and autoheal shouldn't have restarted anything
    autoheal_log_lines = _healthcheck("logs", "autoheal").split("\n")
    unexpected_lines = [
        line
        for line in autoheal_log_lines
        if line and not line.startswith("Attaching to ")
    ]
    assert not unexpected_lines

0 comments on commit 118a9ac

Please sign in to comment.