Skip to content

Commit

Permalink
Adding node_events from the slurm database
Browse files Browse the repository at this point in the history
  • Loading branch information
guilbaults committed Jan 12, 2024
1 parent 456e8ec commit 2b00b29
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 1 deletion.
25 changes: 25 additions & 0 deletions jobstats/templates/jobstats/job.html
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,31 @@ <h2>{% translate "Scheduler info" %}</h2>
</tr>
</tbody>
</table>

{% if node_events %}
<h2>{% translate "Node events" %}</h2>
<table class="table table-striped">
<thead class="thead-dark">
<tr>
<th scope="col">{% translate "Node" %}</th>
<th scope="col">{% translate "Start time" %}</th>
<th scope="col">{% translate "End time" %}</th>
<th scope="col">{% translate "Reason" %}</th>
</tr>
</thead>
<tbody>
{% for node_event in node_events %}
<tr>
<td>{{node_event.node_name}}</td>
<td><span data-toggle="tooltip" data-placement="top" title="{{node_event.time_start_dt}}">{{node_event.time_start_dt | naturaltime}} <span data-feather="info"></span></span></td>
<td><span data-toggle="tooltip" data-placement="top" title="{{node_event.time_end_dt}}">{{node_event.time_end_dt | naturaltime}} <span data-feather="info"></span></span></td>
<td>{{node_event.reason}}</td>
</tr>
{% endfor %}
</tbody>
</table>
{% endif %}

{% endif %}

{% if multiple_jobs is False %}
Expand Down
23 changes: 22 additions & 1 deletion jobstats/views.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from django.shortcuts import render, redirect
from django.http import HttpResponseNotFound, JsonResponse
from slurm.models import JobTable, AssocTable
from slurm.models import JobTable, AssocTable, EventTable
from userportal.common import user_or_staff, username_to_uid, Prometheus, request_to_username, compute_allocations_by_user, get_step, parse_start_end, fixed_zoom_config
from django.conf import settings
from datetime import datetime, timedelta
Expand Down Expand Up @@ -446,6 +446,27 @@ def job(request, username, job_id):
for graph in comment.graph_ids:
context['graph_div'][graph] = comment.display_card_class()

context['node_events'] = []
try:
# only completed jobs seem to have events
# gather events that occurred on the nodes of the job, 1 hour before and after
start = job.time_start - 3600
end = job.time_end + 3600

started = EventTable.objects\
.filter(node_name__in=job.nodes())\
.filter(time_start__gte=start)\
.filter(time_start__lte=end).all()

ended = EventTable.objects\
.filter(node_name__in=job.nodes())\
.filter(time_end__gte=start)\
.filter(time_end__lte=end).all()

context['node_events'] = started | ended
except IndexError:
pass

# export some settings to the template
context['CLOUD_CPU_CORE_COST_PER_HOUR'] = settings.CLOUD_CPU_CORE_COST_PER_HOUR
context['CLOUD_GPU_COST_PER_HOUR'] = settings.CLOUD_GPU_COST_PER_HOUR
Expand Down
24 changes: 24 additions & 0 deletions nodes/templates/nodes/node.html
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,30 @@
{% block content %}
<h1>{% translate "Node" %} {{node}}</h1>

{% if node_events %}
<h2>{% translate "Node events" %}</h2>
<table class="table table-striped">
<thead class="thead-dark">
<tr>
<th scope="col">{% translate "Node" %}</th>
<th scope="col">{% translate "Start time" %}</th>
<th scope="col">{% translate "End time" %}</th>
<th scope="col">{% translate "Reason" %}</th>
</tr>
</thead>
<tbody>
{% for node_event in node_events %}
<tr>
<td>{{node_event.node_name}}</td>
<td><span data-toggle="tooltip" data-placement="top" title="{{node_event.time_start_dt}}">{{node_event.time_start_dt | naturaltime}} <span data-feather="info"></span></span></td>
<td><span data-toggle="tooltip" data-placement="top" title="{{node_event.time_end_dt}}">{{node_event.time_end_dt | naturaltime}} <span data-feather="info"></span></span></td>
<td>{{node_event.reason}}</td>
</tr>
{% endfor %}
</tbody>
</table>
{% endif %}

<h2>{% translate "Gantt" %}</h2>
{% if gpu %}
<h3>{% translate "Gpus" %}</h3>
Expand Down
20 changes: 20 additions & 0 deletions nodes/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from datetime import datetime, timedelta
from django.utils.translation import gettext as _
from jobstats.views import GPU_MEMORY, GPU_SHORT_NAME, GPU_IDLE_POWER, GPU_FULL_POWER
from slurm.models import EventTable


prom = Prometheus(settings.PROMETHEUS)
Expand Down Expand Up @@ -147,6 +148,25 @@ def node(request, node):
stats_gpu = prom.query_prometheus_multiple(query_gpu, START, END)
context['gpu'] = len(stats_gpu) > 0

context['node_events'] = []
try:
start = START.timestamp()
end = END.timestamp()

started = EventTable.objects\
.filter(node_name=node)\
.filter(time_start__gte=start)\
.filter(time_start__lte=end).all()

ended = EventTable.objects\
.filter(node_name=node)\
.filter(time_end__gte=start)\
.filter(time_end__lte=end).all()

context['node_events'] = started | ended
except IndexError:
pass

return render(request, 'nodes/node.html', context)


Expand Down
10 changes: 10 additions & 0 deletions slurm/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,16 @@ class Meta:
db_table = settings.CLUSTER_NAME + '_event_table'
unique_together = (('node_name', 'time_start'),)

def time_start_dt(self):
    """Return ``time_start`` converted to a ``datetime``, or ``None``.

    ``time_start`` is read as a POSIX timestamp and converted with
    ``datetime.datetime.fromtimestamp`` (naive, local time). A stored
    value of 0 yields ``None`` instead of the epoch.
    """
    # NOTE(review): 0 presumably means "not set" in the slurm database -- confirm.
    epoch_seconds = self.time_start
    return datetime.datetime.fromtimestamp(epoch_seconds) if epoch_seconds != 0 else None

def time_end_dt(self):
    """Return ``time_end`` converted to a ``datetime``, or ``None``.

    ``time_end`` is read as a POSIX timestamp and converted with
    ``datetime.datetime.fromtimestamp`` (naive, local time). A stored
    value of 0 yields ``None`` instead of the epoch.
    """
    # NOTE(review): 0 presumably marks an event that has not ended yet -- confirm.
    epoch_seconds = self.time_end
    return datetime.datetime.fromtimestamp(epoch_seconds) if epoch_seconds != 0 else None


class JobTable(models.Model):
class StatesJob(models.IntegerChoices):
Expand Down

0 comments on commit 2b00b29

Please sign in to comment.