-
Notifications
You must be signed in to change notification settings - Fork 241
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adding containerd compatability to oom_logger - COMPINFRA-3947 #3904
Changes from all commits
a978a3b
070a761
ba53ddb
52b62c8
16d092d
2173227
aa1df40
8a9f0d1
e609b71
07dde6d
b79ffdf
3418ca2
cbd610f
048a1a8
0ba8fba
b6efc9f
c72a4ed
c4ffc3f
447f321
0e2f1db
47ed5a4
3f6395b
0818402
aa108c1
a9299bd
96ce384
14a16bb
4132b65
f3c13bc
416f9b9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,10 +33,16 @@ | |
destination(paasta_oom_logger); | ||
}; | ||
""" | ||
import argparse | ||
import json | ||
import re | ||
import sys | ||
from collections import namedtuple | ||
from typing import Dict | ||
|
||
import grpc | ||
from containerd.services.containers.v1 import containers_pb2 | ||
from containerd.services.containers.v1 import containers_pb2_grpc | ||
from docker.errors import APIError | ||
|
||
from paasta_tools.cli.utils import get_instance_config | ||
|
@@ -76,6 +82,16 @@ | |
) | ||
|
||
|
||
def parse_args() -> argparse.Namespace: | ||
parser = argparse.ArgumentParser(description="paasta_oom_logger") | ||
parser.add_argument( | ||
"--containerd", | ||
action="store_true", | ||
help="Use containerd to inspect containers, otherwise use docker", | ||
) | ||
return parser.parse_args() | ||
|
||
|
||
def capture_oom_events_from_stdin(): | ||
process_name_regex = re.compile( | ||
r"^\d+\s[a-zA-Z0-9\-]+\s.*\]\s(.+)\sinvoked\soom-killer:" | ||
|
@@ -93,6 +109,25 @@ def capture_oom_events_from_stdin(): | |
""", | ||
re.VERBOSE, | ||
) | ||
oom_regex_kubernetes_containerd_systemd_cgroup = re.compile( | ||
r""" | ||
^(\d+)\s # timestamp | ||
([a-zA-Z0-9\-]+) # hostname | ||
\s.*oom-kill:.*task_memcg=/.*\.slice/.* # loosely match systemd slice and containerid | ||
cri-containerd:(\w{64}).*$ # containerid | ||
""", | ||
re.VERBOSE, | ||
) | ||
|
||
oom_regex_kubernetes_containerd_systemd_cgroup_structured = re.compile( | ||
r""" | ||
^(\d+)\s # timestamp | ||
([a-zA-Z0-9\-]+) # hostname | ||
\s.*oom-kill:.*task_memcg=/kubepods\.slice/.* # match systemd slice and containerid | ||
cri-containerd-(\w{64}).*$ # containerid | ||
""", | ||
re.VERBOSE, | ||
) | ||
oom_regex_kubernetes_structured = re.compile( | ||
r""" | ||
^(\d+)\s # timestamp | ||
|
@@ -115,6 +150,8 @@ def capture_oom_events_from_stdin(): | |
oom_regex_kubernetes, | ||
oom_regex_kubernetes_structured, | ||
oom_regex_kubernetes_systemd_cgroup, | ||
oom_regex_kubernetes_containerd_systemd_cgroup, | ||
oom_regex_kubernetes_containerd_systemd_cgroup_structured, | ||
] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. at some point we should probably audit all of these + remove unused ones + comment the remaining ones from chatting in slack, it seems like we're not quite sure yet why sometimes we're only seeing the structured oomkill loglines and other times the unstructured one - but it does seem a bit weird that we can't use a single regex for containerd pods (or that in the past we needed 3 regexes for dockershim pods!) |
||
|
||
process_name = "" | ||
|
@@ -136,11 +173,18 @@ def capture_oom_events_from_stdin(): | |
break | ||
|
||
|
||
def get_container_env_as_dict(docker_inspect): | ||
def get_container_env_as_dict( | ||
is_cri_containerd: bool, container_inspect: dict | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. might be nice to have some TypedDicts for the container_inspect dict - but that'll be a little annoying since you'd need a Union of two TypedDicts and you'd need to add some casts in the the code below :p |
||
) -> Dict[str, str]: | ||
env_vars = {} | ||
config = docker_inspect.get("Config") | ||
if is_cri_containerd: | ||
config = container_inspect.get("process") | ||
env_key = "env" | ||
else: | ||
config = container_inspect.get("Config") | ||
env_key = "Env" | ||
if config is not None: | ||
env = config.get("Env", []) | ||
env = config.get(env_key, []) | ||
for i in env: | ||
name, _, value = i.partition("=") | ||
env_vars[name] = value | ||
|
@@ -209,18 +253,26 @@ def send_sfx_event(service, instance, cluster): | |
counter.count() | ||
|
||
|
||
def get_containerd_container(container_id: str) -> containers_pb2.Container: | ||
with grpc.insecure_channel("unix:///run/containerd/containerd.sock") as channel: | ||
containersv1 = containers_pb2_grpc.ContainersStub(channel) | ||
return containersv1.Get( | ||
containers_pb2.GetContainerRequest(id=container_id), | ||
metadata=(("containerd-namespace", "k8s.io"),), | ||
).container | ||
|
||
|
||
def main(): | ||
if clog is None: | ||
print("CLog logger unavailable, exiting.", file=sys.stderr) | ||
sys.exit(1) | ||
|
||
args = parse_args() | ||
clog.config.configure( | ||
scribe_host="169.254.255.254", | ||
scribe_port=1463, | ||
monk_disable=False, | ||
scribe_disable=False, | ||
) | ||
|
||
cluster = load_system_paasta_config().get_cluster() | ||
client = get_docker_client() | ||
for ( | ||
|
@@ -229,11 +281,22 @@ def main(): | |
container_id, | ||
process_name, | ||
) in capture_oom_events_from_stdin(): | ||
try: | ||
docker_inspect = client.inspect_container(resource_id=container_id) | ||
except (APIError): | ||
continue | ||
env_vars = get_container_env_as_dict(docker_inspect) | ||
if args.containerd: | ||
# then we're using containerd to inspect containers | ||
try: | ||
container_info = get_containerd_container(container_id) | ||
except grpc.RpcError as e: | ||
print("An error occurred while getting the container:", e) | ||
continue | ||
container_spec_raw = container_info.spec.value.decode("utf-8") | ||
container_inspect = json.loads(container_spec_raw) | ||
else: | ||
# we're using docker to inspect containers | ||
try: | ||
container_inspect = client.inspect_container(resource_id=container_id) | ||
except (APIError): | ||
continue | ||
jfongatyelp marked this conversation as resolved.
Show resolved
Hide resolved
|
||
env_vars = get_container_env_as_dict(args.containerd, container_inspect) | ||
service = env_vars.get("PAASTA_SERVICE", "unknown") | ||
instance = env_vars.get("PAASTA_INSTANCE", "unknown") | ||
mesos_container_id = env_vars.get("MESOS_CONTAINER_NAME", "mesos-null") | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we'll need to do some experimentation w/ an actual oomkilled pod on nodes w/ containerd to verify what format the logs look like, and if we need to update any of these regexes.