Skip to content

Commit

Permalink
Merge pull request #162 from CogStack/metrics-async
Browse files Browse the repository at this point in the history
Metrics async - changes list in the comment above
  • Loading branch information
tomolopolis authored Oct 24, 2023
2 parents 99c4f5e + f3b00e5 commit 81d7a23
Show file tree
Hide file tree
Showing 16 changed files with 461 additions and 7,554 deletions.
9 changes: 9 additions & 0 deletions webapp/api/api/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,3 +472,12 @@ class ExportedProjectAdmin(admin.ModelAdmin):


admin.site.register(ExportedProject, ExportedProjectAdmin)


class ProjectMetricsAdmin(admin.ModelAdmin):
model = ProjectMetrics
list_display = ('report_name', 'report_name_generated')
list_filter = ['projects']


admin.site.register(ProjectMetrics, ProjectMetricsAdmin)
39 changes: 39 additions & 0 deletions webapp/api/api/metrics.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,64 @@
import json
import logging
import math
import os
import warnings
from collections import Counter
from typing import List, Dict
from background_task import background

import numpy as np
import pandas as pd
import torch
from background_task.models import Task
from django.contrib.auth.models import User
from django.db.models import QuerySet
from medcat.cat import CAT
from medcat.cdb import CDB
from medcat.config_meta_cat import ConfigMetaCAT
from medcat.meta_cat import MetaCAT
from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBase
from medcat.utils.meta_cat.data_utils import prepare_from_json, encode_category_values
from medcat.utils.meta_cat.ml_utils import create_batch_piped_data
from medcat.vocab import Vocab
from torch import nn

from api.admin import retrieve_project_data
from api.models import ProjectAnnotateEntities, ProjectMetrics as AppProjectMetrics
from core.settings import MEDIA_ROOT

_dt_fmt = '%Y-%m-%d %H:%M:%S.%f'

logger = logging.getLogger(__name__)


@background(schedule=1, queue='metrics')
def calculate_metrics(project_ids: List[int], report_name: str):
"""
Computes metrics in a background task
:param projects: list of projects to compute metrics for. Uses the 'first' for the CDB, but
should be the same CDB, but will still try and compute metrics regardless
:return: computed metrics results
"""
logger.info('Calculating metrics for report: %s', report_name)
projects = [ProjectAnnotateEntities.objects.filter(id=p_id).first() for p_id in project_ids]
cdb = CDB.load(projects[0].concept_db.cdb_file.path)
vocab = Vocab.load(projects[0].vocab.vocab_file.path)
cat = CAT(cdb, vocab, config=cdb.config)
project_data = retrieve_project_data(projects)
metrics = ProjectMetrics(project_data, cat)
report = metrics.generate_report()
report_file_path = f'{MEDIA_ROOT}/{report_name}.json'
json.dump(report, open(report_file_path, 'w'))
apm = AppProjectMetrics()
apm.report_name_generated = report_name
apm.report.name = report_file_path
apm.save()
apm.projects.set(projects)
logger.info('Finished calculating metrics for report: %s, saved results in ProjectMetrics(id=%s)',
report_name, apm.id)


class ProjectMetrics(object):
"""
Class to analyse MedCATtrainer exports
Expand Down
26 changes: 26 additions & 0 deletions webapp/api/api/migrations/0073_auto_20231022_0028.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Generated by Django 2.2.28 on 2023-10-22 00:28

from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
('api', '0072_delete_projectcuicounter'),
]

operations = [
migrations.CreateModel(
name='ProjectMetrics',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('report_name_generated', models.TextField(help_text='report name that links this metrics report to a previously ran bg task')),
('report_name', models.TextField(help_text='A user specified report name that should be more user friendly than the generated one')),
('report', models.FileField(help_text='the outputted metrics for configured', upload_to='')),
('projects', models.ManyToManyField(blank=True, null=True, to='api.ProjectAnnotateEntities')),
],
),
migrations.DeleteModel(
name='MedCATModel',
),
]
21 changes: 12 additions & 9 deletions webapp/api/api/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,15 +73,6 @@ def __str__(self):
return str(self.vocab_file.name)


class MedCATModel(models.Model):
name = models.CharField(max_length=100)
cdb = models.ForeignKey('ConceptDB', on_delete=models.CASCADE)
vocab = models.ForeignKey('Vocabulary', on_delete=models.CASCADE)

def __str__(self):
return str(self.name)


class Dataset(models.Model):
name = models.CharField(max_length=150)
original_file = models.FileField()
Expand Down Expand Up @@ -287,6 +278,18 @@ def __str__(self):
return self.trainer_export_file.name


class ProjectMetrics(models.Model):
report_name_generated = models.TextField(help_text='report name that links this metrics report to a previously '
'ran bg task')
report_name = models.TextField(help_text='A user specified report name that should be more user friendly than '
'the generated one')
report = models.FileField(help_text='the outputted metrics for configured')
projects = models.ManyToManyField('ProjectAnnotateEntities', null=True, blank=True)

def __str__(self):
return f'generated report name: {self.report_name}, user specified report name:f{self.report_name}'


@receiver(models.signals.post_delete, sender=ConceptDB)
def auto_delete_cdb_file_on_delete(sender, instance, **kwargs):
_remove_file(instance, 'cdb_file')
Expand Down
123 changes: 102 additions & 21 deletions webapp/api/api/views.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import logging
import pickle
import traceback
from datetime import datetime
from tempfile import NamedTemporaryFile

from background_task.models import Task, CompletedTask
from django.http import HttpResponseBadRequest, HttpResponseServerError, HttpResponse
from django.shortcuts import render
from django.utils import timezone
from django_filters import rest_framework as drf
from medcat.cdb import CDB
from medcat.utils.helpers import tkns_from_doc
Expand All @@ -16,7 +19,7 @@
from .admin import download_projects_with_text, download_projects_without_text, \
import_concepts_from_cdb, upload_projects_export, retrieve_project_data
from .medcat_utils import ch2pt_from_pt2ch, get_all_ch, dedupe_preserve_order, snomed_ct_concept_path
from .metrics import ProjectMetrics
from .metrics import calculate_metrics
from .permissions import *
from .serializers import *
from .solr_utils import collections_available, search_collection, ensure_concept_searchable
Expand Down Expand Up @@ -633,28 +636,106 @@ def model_loaded(_):
for p in ProjectAnnotateEntities.objects.all()})


@api_view(http_method_names=['GET'])
def metrics(request):
p_ids = request.GET.get('projectIds').split(',')
projects = ProjectAnnotateEntities.objects.filter(id__in=p_ids)
@api_view(http_method_names=['GET', 'POST'])
def metrics_jobs(request):
dt_fmt = '%Y-%m-%d %H:%M:%S'
if request.method == 'GET':
running_metrics_tasks_qs = Task.objects.filter(queue='metrics')
completed_metrics_tasks = CompletedTask.objects.filter(queue='metrics')

def serialize_task(task, state):
return {
'report_id': task.id,
'report_name_generated': task.verbose_name,
'projects': task.verbose_name.split('-')[1].split(','),
'created_user': task.creator.username,
'create_time': task.run_at.strftime(dt_fmt),
'status': state
}
running_reports = [serialize_task(t, 'running') for t in running_metrics_tasks_qs]
for r, t in zip(running_reports, running_metrics_tasks_qs):
if t.locked_by is None and t.locked_by_pid_running() is None:
r['status'] = 'pending'

comp_reports = [serialize_task(t, 'complete') for t in completed_metrics_tasks]
for comp_task, comp_rep in zip(completed_metrics_tasks, comp_reports):
pm_obj = ProjectMetrics.objects.filter(report_name_generated=comp_task.verbose_name).first()
if pm_obj is not None and pm_obj.report_name is not None:
comp_rep['report_name'] = pm_obj.report_name
reports = running_reports + comp_reports
return Response({'reports': reports})
elif request.method == 'POST':
now = timezone.now()
user = request.user
p_ids = request.data.get('projectIds').split(',')
projects = ProjectAnnotateEntities.objects.filter(id__in=p_ids)

# provide warning of inconsistent models used or for models that are not loaded.
p_cdbs = set(p.concept_db for p in projects)
if len(p_cdbs) > 1:
logger.warning('Inconsistent CDBs used in the generation of metrics - should use the same CDB for '
f'consistent results - found {[cdb.name for cdb in p_cdbs]} - metrics will only use the first'
f' CDB {projects[0].concept_db.name}')

report_name = f'metrics-{"_".join(p_ids)}-{now.strftime(dt_fmt)}'
submitted_job = calculate_metrics([p.id for p in projects],
verbose_name=report_name,
creator=user,
report_name=report_name)
return Response({'metrics_job_id': submitted_job.id, 'metrics_job_name': report_name})


@api_view(http_method_names=['DELETE'])
def remove_metrics_job(request, report_id: int):
running_metrics_tasks_qs = {t.id: t for t in Task.objects.filter(queue='metrics')}
completed_metrics_tasks = {t.id: t for t in CompletedTask.objects.filter(queue='metrics')}
if report_id in running_metrics_tasks_qs:
# remove completed task and associated report
task = running_metrics_tasks_qs[report_id]
if task.locked_by and task.locked_by_pid_running():
logger.info('Will not kill running process - report ID: %s', report_id)
return Response(503, 'Unable to remove a running metrics report job. Please wait until it '
'completes then remove.')
else:
logger.info('Metrics job deleted - report ID: %s', report_id)
elif report_id in completed_metrics_tasks:
task = completed_metrics_tasks[report_id]
try:
pm = ProjectMetrics.objects.filter(report_name_generated=task.verbose_name).first()
if os.path.isfile(pm.report.path):
os.remove(pm.report.path)
pm.delete()
except Exception as e:
pass
task.delete()
logger.info('Completed metrics job deleted - report ID: %s', report_id)
return Response(200, 'task / report deleted')

# provide warning of inconsistent models used or for models that are not loaded.
p_cdbs = set(p.concept_db for p in projects)
if len(p_cdbs) > 1:
logger.warning('Inconsistent CDBs used in the generation of metrics - should use the same CDB for '
f'consistent results - found {[cdb.name for cdb in p_cdbs]} - metrics will only use the first'
f' CDB {projects[0].concept_db.name}')
for p_cdb in p_cdbs:
if p_cdb not in CDB_MAP:
logger.warning(f'CDB {p_cdb.name} not in CDB_MAP cache - this will now be loaded - '
f'and will not show intermediary training status')

cat = get_medcat(CDB_MAP=CDB_MAP, VOCAB_MAP=VOCAB_MAP,
CAT_MAP=CAT_MAP, project=projects[0])
project_data = retrieve_project_data(projects)
metrics = ProjectMetrics(project_data, cat)
report_output = metrics.generate_report()
return Response({'results': report_output})
@api_view(http_method_names=['GET', 'PUT'])
def view_metrics(request, report_id):
if request.method == 'GET':
running_pending_report = Task.objects.filter(id=report_id, queue='metrics').first()
completed_report = CompletedTask.objects.filter(id=report_id, queue='metrics').first()
if running_pending_report is None and completed_report is None:
HttpResponseBadRequest(f'Cannot find report_id:{report_id} in either pending, running or complete report lists. ')
elif running_pending_report is not None:
HttpResponseBadRequest(f'Cannot view a running or pending metrics report with id:{report_id}')
pm_obj = ProjectMetrics.objects.filter(report_name_generated=completed_report.verbose_name).first()
out = {
'results': {
'report_name': pm_obj.report_name,
'report_name_generated': pm_obj.report_name_generated,
**json.load(open(pm_obj.report.path))
}
}
return Response(out)
elif request.method == 'PUT':
completed_report = CompletedTask.objects.filter(id=report_id, queue='metrics').first()
pm_obj = ProjectMetrics.objects.filter(report_name_generated=completed_report.verbose_name).first()
pm_obj.report_name = request.data.get('report_name')
pm_obj.save()
return Response(200)


@api_view(http_method_names=['GET'])
Expand Down
6 changes: 5 additions & 1 deletion webapp/api/core/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,11 @@
]

# BG TASKS
MAX_RUN_TIME=60*60*10
MAX_ATTEMPTS = 1
MAX_RUN_TIME = 60*60*10
BACKGROUND_TASK_RUN_ASYNC = True
BACKGROUND_TASK_ASYNC_THREADS = 4


# Solr Concept Search settings
SOLR_HOST = os.environ.get('CONCEPT_SEARCH_SERVICE_HOST', 'solr')
Expand Down
4 changes: 3 additions & 1 deletion webapp/api/core/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,9 @@
path('api/cache-model/<int:cdb_id>/', api.views.cache_model),
path('api/upload-deployment/', api.views.upload_deployment),
path('api/model-concept-children/<int:cdb_id>/', api.views.cdb_cui_children),
path('api/metrics/', api.views.metrics),
path('api/metrics/<int:report_id>/', api.views.view_metrics),
path('api/metrics-job/', api.views.metrics_jobs),
path('api/metrics-job/<int:report_id>/', api.views.remove_metrics_job),
path('api/concept-path/', api.views.cdb_concept_path),
path('api/generate-concept-filter-json/', api.views.generate_concept_filter_flat_json),
path('api/generate-concept-filter/', api.views.generate_concept_filter),
Expand Down
Loading

0 comments on commit 81d7a23

Please sign in to comment.