diff --git a/requirements-minimal.txt b/requirements-minimal.txt
index 86a6425ae..71bbdd433 100644
--- a/requirements-minimal.txt
+++ b/requirements-minimal.txt
@@ -10,6 +10,7 @@ ipython
 Jinja2>=3.1.2
 lockfile
 moto
+prometheus-client==0.20.0
 psutil
 py-bcrypt
 pyasn1
diff --git a/requirements.txt b/requirements.txt
index 4ffb5b8d6..71910567a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -51,6 +51,7 @@ oauthlib==3.1.0
 parso==0.7.0
 pexpect==4.7.0
 pickleshare==0.7.5
+prometheus-client==0.20.0
 prompt-toolkit==3.0.38
 psutil==5.6.6
 ptyprocess==0.6.0
diff --git a/tests/api/resource_test.py b/tests/api/resource_test.py
index d05a686df..9f5e64fa4 100644
--- a/tests/api/resource_test.py
+++ b/tests/api/resource_test.py
@@ -169,6 +169,7 @@ def test__init__(self):
             b"metrics",
             b"status",
             b"events",
+            b"prom-metrics",
             b"",
         ]
         assert set(expected_children) == set(self.resource.children)
diff --git a/tron/api/resource.py b/tron/api/resource.py
index fd6574dd4..d923e5b05 100644
--- a/tron/api/resource.py
+++ b/tron/api/resource.py
@@ -8,6 +8,7 @@
 import traceback
 
 import staticconf
+from prometheus_client.twisted import MetricsResource as MetricsResourceProm
 
 from tron.config.static_config import get_config_watcher
 from tron.config.static_config import NAMESPACE
@@ -500,6 +501,7 @@ def __init__(self, mcp):
         self.putChild(b"status", StatusResource(mcp))
         self.putChild(b"events", EventsResource())
         self.putChild(b"metrics", MetricsResource())
+        self.putChild(b"prom-metrics", MetricsResourceProm())
         self.putChild(b"", self)
 
     @AsyncResource.bounded
diff --git a/tron/kubernetes.py b/tron/kubernetes.py
index 013a321ad..8f07f93b3 100644
--- a/tron/kubernetes.py
+++ b/tron/kubernetes.py
@@ -15,6 +15,7 @@
 from twisted.internet.defer import logError
 
 import tron.metrics as metrics
+import tron.prom_metrics as prom_metrics
 from tron import __version__
 from tron.actioncommand import ActionCommand
 from tron.config.schema import ConfigFieldSelectorSource
@@ -90,7 +91,13 @@ def report_resources(self, decrement: bool = False) -> None:
         Update internal resource utilization statistics of all tronjobs running for this task's Tron master.
         """
         # TODO(TRON-1612): these should eventually be Prometheus metrics
+        # (the legacy metrics.count() calls below are mirrored by the Prometheus gauges until they can be removed)
         multiplier = -1 if decrement else 1
+        # Gauge.inc() accepts a negative amount, so one call both raises and (when decrement=True) lowers each gauge
+        prom_metrics.tron_cpu_gauge.inc(self.task_config.cpus * multiplier)
+        prom_metrics.tron_memory_gauge.inc(self.task_config.memory * multiplier)
+        prom_metrics.tron_disk_gauge.inc(self.task_config.disk * multiplier)
+
         metrics.count("tron.mesos.cpus", self.task_config.cpus * multiplier)
         metrics.count("tron.mesos.mem", self.task_config.memory * multiplier)
         metrics.count("tron.mesos.disk", self.task_config.disk * multiplier)
diff --git a/tron/prom_metrics.py b/tron/prom_metrics.py
new file mode 100644
index 000000000..ae7310fa7
--- /dev/null
+++ b/tron/prom_metrics.py
@@ -0,0 +1,12 @@
+"""
+Prometheus gauges for the CPU, memory and disk of Tron jobs running on Kubernetes.
+
+The gauges are adjusted by report_resources() in tron/kubernetes.py and are exposed
+through the "prom-metrics" API resource registered in tron/api/resource.py.
+"""
+from prometheus_client import Gauge
+
+
+tron_cpu_gauge = Gauge("tron_k8s_cpus", "Measuring CPU for tron jobs on K8s")
+tron_memory_gauge = Gauge("tron_k8s_mem", "Measuring memory for tron jobs on K8s")
+tron_disk_gauge = Gauge("tron_k8s_disk", "Measuring disk for tron jobs on K8s")