From 472373728cd616a6fbbd0e0e496e53f4be94d114 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Mon, 9 Dec 2024 18:15:18 +0100 Subject: [PATCH 01/61] Move export cache cleaning into the cron jobs --- cvat/apps/dataset_manager/default_settings.py | 2 + cvat/apps/dataset_manager/util.py | 15 +---- cvat/apps/dataset_manager/views.py | 62 ++++++++++++------- cvat/apps/engine/background.py | 3 + cvat/apps/engine/backup.py | 6 +- .../management/commands/syncperiodicjobs.py | 2 + ..._date_project_last_export_date_and_more.py | 28 +++++++++ cvat/apps/engine/models.py | 61 +++++++++++++----- cvat/settings/base.py | 12 ++++ 9 files changed, 139 insertions(+), 52 deletions(-) create mode 100644 cvat/apps/engine/migrations/0087_job_last_export_date_project_last_export_date_and_more.py diff --git a/cvat/apps/dataset_manager/default_settings.py b/cvat/apps/dataset_manager/default_settings.py index a4dd53b0f52e..e313de3d65ac 100644 --- a/cvat/apps/dataset_manager/default_settings.py +++ b/cvat/apps/dataset_manager/default_settings.py @@ -12,3 +12,5 @@ DATASET_EXPORT_LOCKED_RETRY_INTERVAL = int(os.getenv("CVAT_DATASET_EXPORT_LOCKED_RETRY_INTERVAL", 60)) "Retry interval for cases the export cache lock was unavailable, in seconds" + +EXPORT_CACHE_DIR_NAME = "export_cache" \ No newline at end of file diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index 2f1029049bbf..42b514f9764b 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -146,19 +146,6 @@ def get_export_cache_lock( lock.release() -EXPORT_CACHE_DIR_NAME = 'export_cache' - - -def get_export_cache_dir(db_instance: Project | Task | Job) -> str: - base_dir = osp.abspath(db_instance.get_dirname()) - - if osp.isdir(base_dir): - return osp.join(base_dir, EXPORT_CACHE_DIR_NAME) - else: - raise FileNotFoundError( - '{} dir {} does not exist'.format(db_instance.__class__.__name__, base_dir) - ) - def make_export_filename( dst_dir: str, @@ -207,7 +194,7 @@ def parse_export_file_path(file_path: os.PathLike[str]) -> ParsedExportFilename: if not basename_match: raise ValueError(f"Couldn't parse filename components in '{basename}'") - dirname_match = re.search(rf'/(jobs|tasks|projects)/\d+/{EXPORT_CACHE_DIR_NAME}$', dirname) + dirname_match = re.search(rf'/(jobs|tasks|projects)/\d+/{settings.EXPORT_CACHE_DIR_NAME}$', dirname) if not dirname_match: raise ValueError(f"Couldn't parse instance type in '{dirname}'") diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index 1dbff55ed08d..a1b38b5af58f 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -14,6 +14,8 @@ from django.conf import settings from django.utils import timezone from rq_scheduler import Scheduler +from pathlib import Path +from contextlib import suppress import cvat.apps.dataset_manager.project as project import cvat.apps.dataset_manager.task as task @@ -25,10 +27,9 @@ from .util import ( LockNotAvailableError, current_function_name, get_export_cache_lock, - get_export_cache_dir, make_export_filename, + make_export_filename, parse_export_file_path ) -from .util import EXPORT_CACHE_DIR_NAME # pylint: disable=unused-import slogger = ServerLogManager(__name__) @@ -112,8 +113,7 @@ def export(dst_format, project_id=None, task_id=None, job_id=None, server_url=No db_instance = Job.objects.get(pk=job_id) cache_ttl = get_export_cache_ttl(db_instance) - - cache_dir = get_export_cache_dir(db_instance) + cache_dir = db_instance.get_export_cache_directory() # As 
we're not locking the db object here, it can be updated by the time of actual export. # The file will be saved with the older timestamp. @@ -205,10 +205,10 @@ def export_project_annotations(project_id, dst_format=None, server_url=None): class FileIsBeingUsedError(Exception): pass -def clear_export_cache(file_path: str, file_ctime: float, logger: logging.Logger) -> None: - # file_ctime is for backward compatibility with older RQ jobs, not needed now - +# TODO: write a migration to delete all clear_export_cache scheduled jobs from scheduler +def clear_export_cache(file_path: str, logger: logging.Logger) -> None: try: + # TODO: update after 8721 with get_export_cache_lock( file_path, block=True, @@ -216,37 +216,57 @@ def clear_export_cache(file_path: str, file_ctime: float, logger: logging.Logger ttl=rq.get_current_job().timeout, ): if not osp.exists(file_path): - raise FileNotFoundError("Export cache file '{}' doesn't exist".format(file_path)) + logger.error("Export cache file '{}' doesn't exist".format(file_path)) parsed_filename = parse_export_file_path(file_path) cache_ttl = get_export_cache_ttl(parsed_filename.instance_type) if timezone.now().timestamp() <= osp.getmtime(file_path) + cache_ttl.total_seconds(): - # Need to retry later, the export is in use - _retry_current_rq_job(cache_ttl) logger.info( - "Export cache file '{}' is recently accessed, will retry in {}".format( - file_path, cache_ttl - ) + "Export cache file '{}' is recently accessed".format(file_path) ) - raise FileIsBeingUsedError # should be handled by the worker + raise FileIsBeingUsedError - # TODO: maybe remove all outdated exports os.remove(file_path) - logger.info("Export cache file '{}' successfully removed".format(file_path)) + logger.debug("Export cache file '{}' successfully removed".format(file_path)) except LockNotAvailableError: - # Need to retry later if the lock was not available - _retry_current_rq_job(EXPORT_LOCKED_RETRY_INTERVAL) logger.info( - "Failed to acquire export cache lock. Retrying in {}".format( - EXPORT_LOCKED_RETRY_INTERVAL - ) + "Failed to acquire export cache lock for the file: {file_path}." ) raise except Exception: log_exception(logger) raise + +def cron_job_to_clear_export_cache(Model: str) -> None: + import importlib + assert isinstance(Model, str) + + module_name, Model = Model.rsplit('.', 1) + module = importlib.import_module(module_name) + Model = getattr(module, Model) + + logger = ServerLogManager(__name__).glob + + one_month_ago = timezone.now() - timedelta(days=30) + queryset = Model.objects.filter(last_export_date__gte=one_month_ago) + + for instance in queryset: + try: + export_cache_dir_path = Path(instance.get_export_cache_directory()) + except FileNotFoundError as ex: + logger.warning(str(ex)) + continue + + for child in export_cache_dir_path.iterdir(): + if not child.is_file(): + logger.exception(f'Unexpected file found in export cache: {child.name}') + continue + + with suppress(Exception): + clear_export_cache(child, logger) + def get_export_formats(): return list(EXPORT_FORMATS.values()) diff --git a/cvat/apps/engine/background.py b/cvat/apps/engine/background.py index 441d4702014d..e3b7c195fbe8 100644 --- a/cvat/apps/engine/background.py +++ b/cvat/apps/engine/background.py @@ -463,6 +463,9 @@ def setup_background_job( db_storage = None result_url = self.make_result_url() + # TODO: move into worker? 
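
The cleanup entry point above receives its target model as a dotted import path, so the scheduled job's arguments stay plain, serializable strings. A minimal sketch of that resolution step, using only the standard library (the helper name here is illustrative; Django ships the equivalent as `django.utils.module_loading.import_string`):

```python
import importlib


def resolve_model(path_to_model: str) -> type:
    # Split "package.module.ClassName" into its module path and class
    # name, import the module, then pull the class off the module.
    module_name, model_name = path_to_model.rsplit(".", 1)
    module = importlib.import_module(module_name)
    return getattr(module, model_name)


# e.g. resolve_model("cvat.apps.engine.models.Project") -> the Project model class
```
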
+ self.db_instance.touch_last_export_date() + with get_rq_lock_by_user(queue, user_id): queue.enqueue_call( func=func, diff --git a/cvat/apps/engine/backup.py b/cvat/apps/engine/backup.py index 499700a3b4ef..1e73424fbd11 100644 --- a/cvat/apps/engine/backup.py +++ b/cvat/apps/engine/backup.py @@ -49,7 +49,7 @@ from cvat.apps.engine.cloud_provider import import_resource_from_cloud_storage from cvat.apps.engine.location import StorageType, get_location_configuration from cvat.apps.engine.permissions import get_cloud_storage_for_import_or_export -from cvat.apps.dataset_manager.views import get_export_cache_dir, log_exception +from cvat.apps.dataset_manager.views import log_exception from cvat.apps.dataset_manager.bindings import CvatImportError slogger = ServerLogManager(__name__) @@ -1015,9 +1015,9 @@ def _import_project(filename, user, org_id): db_project = project_importer.import_project() return db_project.id -def create_backup(db_instance, Exporter, output_path, logger, cache_ttl): +def create_backup(db_instance: models.Project | models.Task, Exporter, output_path, logger, cache_ttl): try: - cache_dir = get_export_cache_dir(db_instance) + cache_dir = db_instance.get_export_cache_directory() output_path = os.path.join(cache_dir, output_path) instance_time = timezone.localtime(db_instance.updated_date).timestamp() diff --git a/cvat/apps/engine/management/commands/syncperiodicjobs.py b/cvat/apps/engine/management/commands/syncperiodicjobs.py index 097f468b337f..ecabcc11e19f 100644 --- a/cvat/apps/engine/management/commands/syncperiodicjobs.py +++ b/cvat/apps/engine/management/commands/syncperiodicjobs.py @@ -71,6 +71,8 @@ def handle(self, *args, **options): cron_string=job_definition['cron_string'], func=job_definition['func'], id=job_id, + args=job_definition.get('args'), + kwargs=job_definition.get('kwargs'), ) queue.connection.sadd(periodic_jobs_key, job_id) diff --git a/cvat/apps/engine/migrations/0087_job_last_export_date_project_last_export_date_and_more.py b/cvat/apps/engine/migrations/0087_job_last_export_date_project_last_export_date_and_more.py new file mode 100644 index 000000000000..9468ae3f2768 --- /dev/null +++ b/cvat/apps/engine/migrations/0087_job_last_export_date_project_last_export_date_and_more.py @@ -0,0 +1,28 @@ +# Generated by Django 4.2.15 on 2024-12-09 16:51 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("engine", "0086_profile_has_analytics_access"), + ] + + operations = [ + migrations.AddField( + model_name="job", + name="last_export_date", + field=models.DateTimeField(null=True), + ), + migrations.AddField( + model_name="project", + name="last_export_date", + field=models.DateTimeField(null=True), + ), + migrations.AddField( + model_name="task", + name="last_export_date", + field=models.DateTimeField(null=True), + ), + ] diff --git a/cvat/apps/engine/models.py b/cvat/apps/engine/models.py index 6212ce3a8bc0..b81ad69d5859 100644 --- a/cvat/apps/engine/models.py +++ b/cvat/apps/engine/models.py @@ -10,6 +10,7 @@ import re import shutil import uuid +from abc import ABCMeta, abstractmethod from enum import Enum from functools import cached_property from typing import Any, ClassVar, Collection, Dict, Optional @@ -21,7 +22,9 @@ from django.db import IntegrityError, models, transaction from django.db.models import Q, TextChoices from django.db.models.fields import FloatField +from django.db.models.base import ModelBase from django.utils.translation import gettext_lazy as _ +from django.utils import timezone 
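
The `syncperiodicjobs` hunk above forwards the new optional `args`/`kwargs` entries from each job definition to the scheduler. A hedged sketch of the resulting registration call, assuming it lands in `rq-scheduler`'s `Scheduler.cron()`, which accepts these keywords:

```python
from rq_scheduler import Scheduler


def register_periodic_job(scheduler: Scheduler, job_id: str, job_definition: dict):
    # dict.get() yields None for definitions that predate the new keys,
    # which the scheduler treats the same as "no arguments".
    return scheduler.cron(
        cron_string=job_definition["cron_string"],
        func=job_definition["func"],
        id=job_id,
        args=job_definition.get("args"),
        kwargs=job_definition.get("kwargs"),
    )
```
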
from drf_spectacular.types import OpenApiTypes from drf_spectacular.utils import extend_schema_field @@ -424,6 +427,30 @@ class Meta: def touch(self) -> None: self.save(update_fields=["updated_date"]) +class ABCModelMeta(ABCMeta, ModelBase): + pass + +class _FileSystemRelatedModel(models.Model, metaclass=ABCModelMeta): + class Meta: + abstract = True + + @abstractmethod + def get_dirname(self) -> str: + ... + + def get_tmp_dirname(self) -> str: + return os.path.join(self.get_dirname(), "tmp") + + def get_export_cache_directory(self) -> str: + base_dir = os.path.abspath(self.get_dirname()) + + if os.path.isdir(base_dir): + return os.path.join(base_dir, settings.EXPORT_CACHE_DIR_NAME) + + raise FileNotFoundError( + '{self.__class__.__name__}: dir {base_dir} does not exist' + ) + @transaction.atomic(savepoint=False) def clear_annotations_in_jobs(job_ids): for job_ids_chunk in chunked_list(job_ids, chunk_size=1000): @@ -436,7 +463,7 @@ def clear_annotations_in_jobs(job_ids): LabeledImageAttributeVal.objects.filter(image__job_id__in=job_ids_chunk).delete() LabeledImage.objects.filter(job_id__in=job_ids_chunk).delete() -class Project(TimestampedModel): +class Project(TimestampedModel, _FileSystemRelatedModel): name = SafeCharField(max_length=256) owner = models.ForeignKey(User, null=True, blank=True, on_delete=models.SET_NULL, related_name="+") @@ -454,6 +481,12 @@ class Project(TimestampedModel): target_storage = models.ForeignKey('Storage', null=True, default=None, blank=True, on_delete=models.SET_NULL, related_name='+') + last_export_date = models.DateTimeField(null=True) + + def touch_last_export_date(self): + self.last_export_date = timezone.now() + self.save(update_fields=["last_export_date"]) + def get_labels(self, prefetch=False): queryset = self.label_set.filter(parent__isnull=True).select_related('skeleton') return queryset.prefetch_related( @@ -463,9 +496,6 @@ def get_labels(self, prefetch=False): def get_dirname(self): return os.path.join(settings.PROJECTS_ROOT, str(self.id)) - def get_tmp_dirname(self): - return os.path.join(self.get_dirname(), "tmp") - def is_job_staff(self, user_id): if self.owner == user_id: return True @@ -514,7 +544,7 @@ def with_job_summary(self): ) ) -class Task(TimestampedModel): +class Task(TimestampedModel, _FileSystemRelatedModel): objects = TaskQuerySet.as_manager() project = models.ForeignKey(Project, on_delete=models.CASCADE, @@ -543,6 +573,7 @@ class Task(TimestampedModel): blank=True, on_delete=models.SET_NULL, related_name='+') target_storage = models.ForeignKey('Storage', null=True, default=None, blank=True, on_delete=models.SET_NULL, related_name='+') + last_export_date = models.DateTimeField(null=True) segment_set: models.manager.RelatedManager[Segment] @@ -550,6 +581,10 @@ class Task(TimestampedModel): class Meta: default_permissions = () + def touch_last_export_date(self): + self.last_export_date = timezone.now() + self.save(update_fields=["last_export_date"]) + def get_labels(self, prefetch=False): project = self.project if project: @@ -560,12 +595,9 @@ def get_labels(self, prefetch=False): 'attributespec_set', 'sublabels__attributespec_set', ) if prefetch else queryset - def get_dirname(self): + def get_dirname(self) -> str: return os.path.join(settings.TASKS_ROOT, str(self.id)) - def get_tmp_dirname(self): - return os.path.join(self.get_dirname(), "tmp") - def is_job_staff(self, user_id): if self.owner == user_id: return True @@ -808,7 +840,7 @@ def _validate_constraints(self, obj: Dict[str, Any]): -class Job(TimestampedModel): +class 
Job(TimestampedModel, _FileSystemRelatedModel): objects = JobQuerySet.as_manager() segment = models.ForeignKey(Segment, on_delete=models.CASCADE) @@ -826,9 +858,13 @@ class Job(TimestampedModel): default=StageChoice.ANNOTATION) state = models.CharField(max_length=32, choices=StateChoice.choices(), default=StateChoice.NEW) - type = models.CharField(max_length=32, choices=JobType.choices(), default=JobType.ANNOTATION) + last_export_date = models.DateTimeField(null=True) + + def touch_last_export_date(self): + self.last_export_date = timezone.now() + self.save(update_fields=["last_export_date"]) def get_target_storage(self) -> Optional[Storage]: return self.segment.task.target_storage @@ -839,9 +875,6 @@ def get_source_storage(self) -> Optional[Storage]: def get_dirname(self): return os.path.join(settings.JOBS_ROOT, str(self.id)) - def get_tmp_dirname(self): - return os.path.join(self.get_dirname(), 'tmp') - @extend_schema_field(OpenApiTypes.INT) def get_project_id(self): project = self.segment.task.project diff --git a/cvat/settings/base.py b/cvat/settings/base.py index 0f6147dc4bf0..04a490817e07 100644 --- a/cvat/settings/base.py +++ b/cvat/settings/base.py @@ -353,6 +353,18 @@ class CVAT_QUEUES(Enum): 'func': 'cvat.apps.iam.utils.clean_up_sessions', 'cron_string': '0 0 * * *', }, + *( + { + 'queue': CVAT_QUEUES.CLEANING.value, + 'id': f'clear_{model.lower()}_export_cache', + 'func': 'cvat.apps.dataset_manager.views.cron_job_to_clear_export_cache', + # Run once a day at midnight + 'cron_string': '0 0 * * *', + # 'cron_string': '05 17 * * *', + 'args': (f'cvat.apps.engine.models.{model.title()}',), + } + for model in ('project', 'task', 'job') + ), ] # JavaScript and CSS compression From 92f9d0b5b8e3422c47c98f710dddf4e9b03963f6 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Tue, 10 Dec 2024 16:55:13 +0100 Subject: [PATCH 02/61] Refactor a bit --- cvat/apps/dataset_manager/default_settings.py | 2 +- cvat/apps/dataset_manager/util.py | 2 - cvat/apps/dataset_manager/views.py | 37 ++++++++++------ cvat/apps/engine/background.py | 1 - cvat/apps/engine/backup.py | 3 +- cvat/apps/engine/models.py | 44 ++++++++----------- cvat/settings/base.py | 4 +- 7 files changed, 47 insertions(+), 46 deletions(-) diff --git a/cvat/apps/dataset_manager/default_settings.py b/cvat/apps/dataset_manager/default_settings.py index e313de3d65ac..b7d5757986ca 100644 --- a/cvat/apps/dataset_manager/default_settings.py +++ b/cvat/apps/dataset_manager/default_settings.py @@ -13,4 +13,4 @@ DATASET_EXPORT_LOCKED_RETRY_INTERVAL = int(os.getenv("CVAT_DATASET_EXPORT_LOCKED_RETRY_INTERVAL", 60)) "Retry interval for cases the export cache lock was unavailable, in seconds" -EXPORT_CACHE_DIR_NAME = "export_cache" \ No newline at end of file +EXPORT_CACHE_DIR_NAME = "export_cache" diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index 42b514f9764b..0f117ddc1e3e 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -23,8 +23,6 @@ from django.db import models from pottery import Redlock -from cvat.apps.engine.models import Job, Project, Task - def current_function_name(depth=1): return inspect.getouterframes(inspect.currentframe())[depth].function diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index a1b38b5af58f..69a7cbccb0dd 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -9,6 +9,7 @@ import tempfile from datetime import timedelta +import importlib import django_rq import rq 
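
For reference, the periodic-job generator added to `base.py` above expands into three cleanup definitions, one per model. The first generated entry looks like this (shown as expanded data, not standalone code; `CVAT_QUEUES` is the queue-name enum defined earlier in the same settings module):

```python
{
    'queue': CVAT_QUEUES.CLEANING.value,
    'id': 'clear_project_export_cache',
    'func': 'cvat.apps.dataset_manager.views.cron_job_to_clear_export_cache',
    'cron_string': '0 0 * * *',
    'args': ('cvat.apps.engine.models.Project',),
}
```
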
from django.conf import settings @@ -23,6 +24,7 @@ from cvat.apps.engine.models import Job, Project, Task from cvat.apps.engine.utils import get_rq_lock_by_user +from django.db.models import QuerySet from .formats.registry import EXPORT_FORMATS, IMPORT_FORMATS from .util import ( LockNotAvailableError, @@ -113,7 +115,7 @@ def export(dst_format, project_id=None, task_id=None, job_id=None, server_url=No db_instance = Job.objects.get(pk=job_id) cache_ttl = get_export_cache_ttl(db_instance) - cache_dir = db_instance.get_export_cache_directory() + cache_dir = db_instance.get_export_cache_directory(create=True) # As we're not locking the db object here, it can be updated by the time of actual export. # The file will be saved with the older timestamp. @@ -239,34 +241,43 @@ def clear_export_cache(file_path: str, logger: logging.Logger) -> None: raise -def cron_job_to_clear_export_cache(Model: str) -> None: - import importlib - assert isinstance(Model, str) +def cron_export_cache_cleanup(path_to_model: str) -> None: + assert isinstance(path_to_model, str) - module_name, Model = Model.rsplit('.', 1) + started_at = timezone.now() + module_name, model_name = path_to_model.rsplit('.', 1) module = importlib.import_module(module_name) - Model = getattr(module, Model) + ModelClass = getattr(module, model_name) + assert ModelClass in (Project, Task, Job) logger = ServerLogManager(__name__).glob one_month_ago = timezone.now() - timedelta(days=30) - queryset = Model.objects.filter(last_export_date__gte=one_month_ago) + queryset: QuerySet[Project | Task | Job] = ModelClass.objects.filter(last_export_date__gte=one_month_ago) - for instance in queryset: - try: - export_cache_dir_path = Path(instance.get_export_cache_directory()) - except FileNotFoundError as ex: - logger.warning(str(ex)) + for instance in queryset.iterator(): + instance_dir_path = Path(instance.get_dirname()) + export_cache_dir_path = Path(instance.get_export_cache_directory()) + + if not export_cache_dir_path.exists(): + logger.debug(f"The {export_cache_dir_path.relative_to(instance_dir_path)} path does not exist, skipping...") continue for child in export_cache_dir_path.iterdir(): if not child.is_file(): - logger.exception(f'Unexpected file found in export cache: {child.name}') + logger.warning(f'Unexpected file found in export cache: {child.relative_to(instance_dir_path)}') continue with suppress(Exception): clear_export_cache(child, logger) + finished_at = timezone.now() + logger.info( + f"Clearing the {model_name}'s export cache has been successfully " + f"completed after {(finished_at - started_at).total_seconds()} seconds..." + ) + + def get_export_formats(): return list(EXPORT_FORMATS.values()) diff --git a/cvat/apps/engine/background.py b/cvat/apps/engine/background.py index e3b7c195fbe8..a76ba8646c84 100644 --- a/cvat/apps/engine/background.py +++ b/cvat/apps/engine/background.py @@ -463,7 +463,6 @@ def setup_background_job( db_storage = None result_url = self.make_result_url() - # TODO: move into worker? 
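
The refactored `cron_export_cache_cleanup` above scans each instance's cache directory, skips anything that is not a regular file, and never lets one bad file abort the sweep. Its core loop, reduced to a standalone sketch (`remove_file` stands in for `clear_export_cache`):

```python
from contextlib import suppress
from pathlib import Path
from typing import Callable


def sweep_export_cache(cache_dir: Path, remove_file: Callable[[Path], None]) -> None:
    for child in cache_dir.iterdir():
        # The cache dir may contain temporary sub-directories created by
        # in-progress exports; only plain files are cleanup candidates.
        if not child.is_file():
            continue
        # A failure on one file (e.g. its lock being busy) should not
        # prevent the remaining files from being inspected.
        with suppress(Exception):
            remove_file(child)
```
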
self.db_instance.touch_last_export_date() with get_rq_lock_by_user(queue, user_id): diff --git a/cvat/apps/engine/backup.py b/cvat/apps/engine/backup.py index 1e73424fbd11..dd9144f2db72 100644 --- a/cvat/apps/engine/backup.py +++ b/cvat/apps/engine/backup.py @@ -1017,13 +1017,12 @@ def _import_project(filename, user, org_id): def create_backup(db_instance: models.Project | models.Task, Exporter, output_path, logger, cache_ttl): try: - cache_dir = db_instance.get_export_cache_directory() + cache_dir = db_instance.get_export_cache_directory(create=True) output_path = os.path.join(cache_dir, output_path) instance_time = timezone.localtime(db_instance.updated_date).timestamp() if not (os.path.exists(output_path) and \ instance_time <= os.path.getmtime(output_path)): - os.makedirs(cache_dir, exist_ok=True) with tempfile.TemporaryDirectory(dir=cache_dir) as temp_dir: temp_file = os.path.join(temp_dir, 'dump') exporter = Exporter(db_instance.id) diff --git a/cvat/apps/engine/models.py b/cvat/apps/engine/models.py index b81ad69d5859..2ec9c929206f 100644 --- a/cvat/apps/engine/models.py +++ b/cvat/apps/engine/models.py @@ -441,15 +441,25 @@ def get_dirname(self) -> str: def get_tmp_dirname(self) -> str: return os.path.join(self.get_dirname(), "tmp") - def get_export_cache_directory(self) -> str: + def get_export_cache_directory(self, create: bool = False) -> str: base_dir = os.path.abspath(self.get_dirname()) + cache_dir = os.path.join(base_dir, settings.EXPORT_CACHE_DIR_NAME) - if os.path.isdir(base_dir): - return os.path.join(base_dir, settings.EXPORT_CACHE_DIR_NAME) + if create: + os.makedirs(cache_dir, exist_ok=True) - raise FileNotFoundError( - '{self.__class__.__name__}: dir {base_dir} does not exist' - ) + return cache_dir + + +class _Exportable(models.Model): + class Meta: + abstract = True + + last_export_date = models.DateTimeField(null=True) + + def touch_last_export_date(self): + self.last_export_date = timezone.now() + self.save(update_fields=["last_export_date"]) @transaction.atomic(savepoint=False) def clear_annotations_in_jobs(job_ids): @@ -463,7 +473,7 @@ def clear_annotations_in_jobs(job_ids): LabeledImageAttributeVal.objects.filter(image__job_id__in=job_ids_chunk).delete() LabeledImage.objects.filter(job_id__in=job_ids_chunk).delete() -class Project(TimestampedModel, _FileSystemRelatedModel): +class Project(TimestampedModel, _FileSystemRelatedModel, _Exportable): name = SafeCharField(max_length=256) owner = models.ForeignKey(User, null=True, blank=True, on_delete=models.SET_NULL, related_name="+") @@ -481,12 +491,6 @@ class Project(TimestampedModel, _FileSystemRelatedModel): target_storage = models.ForeignKey('Storage', null=True, default=None, blank=True, on_delete=models.SET_NULL, related_name='+') - last_export_date = models.DateTimeField(null=True) - - def touch_last_export_date(self): - self.last_export_date = timezone.now() - self.save(update_fields=["last_export_date"]) - def get_labels(self, prefetch=False): queryset = self.label_set.filter(parent__isnull=True).select_related('skeleton') return queryset.prefetch_related( @@ -544,7 +548,7 @@ def with_job_summary(self): ) ) -class Task(TimestampedModel, _FileSystemRelatedModel): +class Task(TimestampedModel, _FileSystemRelatedModel, _Exportable): objects = TaskQuerySet.as_manager() project = models.ForeignKey(Project, on_delete=models.CASCADE, @@ -573,7 +577,6 @@ class Task(TimestampedModel, _FileSystemRelatedModel): blank=True, on_delete=models.SET_NULL, related_name='+') target_storage = models.ForeignKey('Storage', 
null=True, default=None, blank=True, on_delete=models.SET_NULL, related_name='+') - last_export_date = models.DateTimeField(null=True) segment_set: models.manager.RelatedManager[Segment] @@ -581,10 +584,6 @@ class Task(TimestampedModel, _FileSystemRelatedModel): class Meta: default_permissions = () - def touch_last_export_date(self): - self.last_export_date = timezone.now() - self.save(update_fields=["last_export_date"]) - def get_labels(self, prefetch=False): project = self.project if project: @@ -840,7 +839,7 @@ def _validate_constraints(self, obj: Dict[str, Any]): -class Job(TimestampedModel, _FileSystemRelatedModel): +class Job(TimestampedModel, _FileSystemRelatedModel, _Exportable): objects = JobQuerySet.as_manager() segment = models.ForeignKey(Segment, on_delete=models.CASCADE) @@ -860,11 +859,6 @@ class Job(TimestampedModel, _FileSystemRelatedModel): default=StateChoice.NEW) type = models.CharField(max_length=32, choices=JobType.choices(), default=JobType.ANNOTATION) - last_export_date = models.DateTimeField(null=True) - - def touch_last_export_date(self): - self.last_export_date = timezone.now() - self.save(update_fields=["last_export_date"]) def get_target_storage(self) -> Optional[Storage]: return self.segment.task.target_storage diff --git a/cvat/settings/base.py b/cvat/settings/base.py index 04a490817e07..84fd487d2dfc 100644 --- a/cvat/settings/base.py +++ b/cvat/settings/base.py @@ -356,8 +356,8 @@ class CVAT_QUEUES(Enum): *( { 'queue': CVAT_QUEUES.CLEANING.value, - 'id': f'clear_{model.lower()}_export_cache', - 'func': 'cvat.apps.dataset_manager.views.cron_job_to_clear_export_cache', + 'id': f'cron_{model.lower()}_export_cache_cleanup', + 'func': 'cvat.apps.dataset_manager.views.cron_export_cache_cleanup', # Run once a day at midnight 'cron_string': '0 0 * * *', # 'cron_string': '05 17 * * *', From e4c24ea396cb6ea0da781b4af3dc20b1fe2f0fe1 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Wed, 11 Dec 2024 11:52:07 +0100 Subject: [PATCH 03/61] Remove outdated code --- .../tests/test_rest_api_formats.py | 77 +++---------------- cvat/apps/dataset_manager/views.py | 23 +----- 2 files changed, 14 insertions(+), 86 deletions(-) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index 059a45f6df2d..f50c1a00d694 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -1482,7 +1482,7 @@ def patched_log_exception(logger=None, exc_info=True): mock_os_replace.assert_called_once() - def _clear(*_, file_path: str, file_ctime: str): + def _clear(*_, file_path: str): from os import remove as original_remove from cvat.apps.dataset_manager.util import LockNotAvailableError @@ -1508,7 +1508,7 @@ def _clear(*_, file_path: str, file_ctime: str): exited_by_timeout = False try: clear_export_cache( - file_path=file_path, file_ctime=file_ctime, logger=MagicMock() + file_path=file_path, logger=MagicMock() ) except LockNotAvailableError: # should come from waiting for get_export_cache_lock @@ -1546,8 +1546,6 @@ def _clear(*_, file_path: str, file_ctime: str): first_export_path = export(dst_format=format_name, task_id=task_id) - export_instance_timestamp = parse_export_file_path(first_export_path).instance_timestamp - self._create_annotations(task, f'{format_name} many jobs', "default") processes_finished_correctly = False @@ -1570,7 +1568,7 @@ def _clear(*_, file_path: str, file_ctime: str): export_checked_the_file, export_created_the_file, 
export_file_path, clear_removed_the_file, ), - kwargs=dict(file_path=first_export_path, file_ctime=export_instance_timestamp), + kwargs=dict(file_path=first_export_path), ))) export_process.start() @@ -1689,7 +1687,7 @@ def patched_osp_exists(path: str): mock_osp_exists.assert_called() - def _clear(*_, file_path: str, file_ctime: str): + def _clear(*_, file_path: str): from os import remove as original_remove from cvat.apps.dataset_manager.util import LockNotAvailableError @@ -1714,7 +1712,7 @@ def _clear(*_, file_path: str, file_ctime: str): exited_by_timeout = False try: clear_export_cache( - file_path=file_path, file_ctime=file_ctime, logger=MagicMock() + file_path=file_path, logger=MagicMock() ) except LockNotAvailableError: # should come from waiting for get_export_cache_lock @@ -1755,8 +1753,6 @@ def patched_export(*args, **kwargs): response = self._get_request_with_data(download_url, download_params, self.admin) self.assertEqual(response.status_code, status.HTTP_201_CREATED) - export_instance_time = parse_export_file_path(export_path).instance_timestamp - download_params["action"] = "download" processes_finished_correctly = False @@ -1771,7 +1767,7 @@ def patched_export(*args, **kwargs): clear_process = es.enter_context(process_closing(multiprocessing.Process( target=_clear, args=(download_checked_the_file, clear_removed_the_file, export_cache_lock), - kwargs=dict(file_path=export_path, file_ctime=export_instance_time), + kwargs=dict(file_path=export_path), ))) download_process.start() @@ -1809,7 +1805,7 @@ def patched_export(*args, **kwargs): self.assertFalse(clear_removed_the_file.get()) - def test_export_can_create_file_and_cleanup_job(self): + def test_export_can_create_file(self): format_name = "CVAT for images 1.1" images = self._generate_task_images(3) task = self._create_task(tasks["main"], images) @@ -1818,19 +1814,14 @@ def test_export_can_create_file_and_cleanup_job(self): with ( patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, - patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler') as mock_rq_get_scheduler, patch('cvat.apps.dataset_manager.views.TTL_CONSTS', new={'task': timedelta(seconds=0)}), ): mock_rq_job = MagicMock(timeout=5) mock_rq_get_current_job.return_value = mock_rq_job - mock_rq_scheduler = MagicMock() - mock_rq_get_scheduler.return_value = mock_rq_scheduler - export_path = export(dst_format=format_name, task_id=task_id) self.assertTrue(osp.isfile(export_path)) - mock_rq_scheduler.enqueue_in.assert_called_once() def test_export_cache_lock_can_raise_on_releasing_expired_lock(self): from pottery import ReleaseUnlockedLock @@ -1918,8 +1909,7 @@ def test_cleanup_can_remove_file(self): mock_rq_get_current_job.return_value = MagicMock(timeout=5) export_path = export(dst_format=format_name, task_id=task_id) - file_ctime = parse_export_file_path(export_path).instance_timestamp - clear_export_cache(file_path=export_path, file_ctime=file_ctime, logger=MagicMock()) + clear_export_cache(file_path=export_path, logger=MagicMock()) self.assertFalse(osp.isfile(export_path)) @@ -1951,8 +1941,7 @@ def test_cleanup_can_request_retry_on_locking_failure(self): mock_rq_job = MagicMock(timeout=5) mock_rq_get_current_job.return_value = mock_rq_job - file_ctime = parse_export_file_path(export_path).instance_timestamp - clear_export_cache(file_path=export_path, file_ctime=file_ctime, logger=MagicMock()) + clear_export_cache(file_path=export_path, logger=MagicMock()) mock_get_export_cache_lock.assert_called() 
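
The tests above stub `get_export_cache_lock`; outside the tests, this series always takes that lock as a context manager around both the staleness check and the removal, so a concurrent download cannot observe a half-deleted file. A sketch of the pattern with illustrative timeout values (the real constants are still pending per the #8721 TODOs):

```python
import os

from django.utils import timezone

from cvat.apps.dataset_manager.util import get_export_cache_lock


def remove_if_stale(file_path: str, cache_ttl_seconds: float) -> None:
    # acquire_timeout/ttl below are placeholders, not the final tuned values
    with get_export_cache_lock(file_path, block=True, acquire_timeout=10, ttl=30):
        if timezone.now().timestamp() > os.path.getmtime(file_path) + cache_ttl_seconds:
            os.remove(file_path)
```
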
self.assertEqual(mock_rq_job.retries_left, 1) @@ -1967,7 +1956,7 @@ def test_cleanup_can_fail_if_no_file(self): mock_rq_job = MagicMock(timeout=5) mock_rq_get_current_job.return_value = mock_rq_job - clear_export_cache(file_path="non existent file path", file_ctime=0, logger=MagicMock()) + clear_export_cache(file_path="non existent file path", logger=MagicMock()) def test_cleanup_can_defer_removal_if_file_is_used_recently(self): format_name = "CVAT for images 1.1" @@ -1994,55 +1983,11 @@ def test_cleanup_can_defer_removal_if_file_is_used_recently(self): mock_rq_get_current_job.return_value = mock_rq_job export_path = export(dst_format=format_name, task_id=task_id) - file_ctime = parse_export_file_path(export_path).instance_timestamp - clear_export_cache(file_path=export_path, file_ctime=file_ctime, logger=MagicMock()) + clear_export_cache(file_path=export_path, logger=MagicMock()) self.assertEqual(mock_rq_job.retries_left, 1) self.assertTrue(osp.isfile(export_path)) - def test_cleanup_can_be_called_with_old_signature_and_values(self): - # Test RQ jobs for backward compatibility of API prior to the PR - # https://github.com/cvat-ai/cvat/pull/7864 - # Jobs referring to the old API can exist in the redis queues after the server is updated - - format_name = "CVAT for images 1.1" - images = self._generate_task_images(3) - task = self._create_task(tasks["main"], images) - self._create_annotations(task, f'{format_name} many jobs', "default") - task_id = task["id"] - - with ( - patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, - patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), - ): - mock_rq_get_current_job.return_value = MagicMock(timeout=5) - - new_export_path = export(dst_format=format_name, task_id=task_id) - - file_ctime = parse_export_file_path(new_export_path).instance_timestamp - - old_export_path = osp.join( - osp.dirname(new_export_path), "annotations_cvat-for-images-11.ZIP" - ) - shutil.move(new_export_path, old_export_path) - - old_kwargs = { - 'file_path': old_export_path, - 'file_ctime': file_ctime, - 'logger': MagicMock(), - } - - with ( - patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, - patch('cvat.apps.dataset_manager.views.TTL_CONSTS', new={'task': timedelta(seconds=0)}), - ): - mock_rq_get_current_job.return_value = MagicMock(timeout=5) - - clear_export_cache(**old_kwargs) - - self.assertFalse(osp.isfile(old_export_path)) - - class ProjectDumpUpload(_DbTestBase): def _get_download_project_dataset_response(self, url, user, dump_format_name, edata): data = { diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index 69a7cbccb0dd..e4a267d8738f 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -148,27 +148,10 @@ def export(dst_format, project_id=None, task_id=None, job_id=None, server_url=No server_url=server_url, save_images=save_images) os.replace(temp_file, output_path) - scheduler: Scheduler = django_rq.get_scheduler( - settings.CVAT_QUEUES.EXPORT_DATA.value - ) - cleaning_job = scheduler.enqueue_in( - time_delta=cache_ttl, - func=clear_export_cache, - file_path=output_path, - file_ctime=instance_update_time.timestamp(), - logger=logger - ) logger.info( - "The {} '{}' is exported as '{}' at '{}' " - "and available for downloading for the next {}. 
" - "Export cache cleaning job is enqueued, id '{}'".format( - db_instance.__class__.__name__.lower(), - db_instance.name if isinstance( - db_instance, (Project, Task) - ) else db_instance.id, - dst_format, output_path, cache_ttl, - cleaning_job.id - ) + f"The {db_instance.__class__.__name__.lower()} '{db_instance.id}' is exported " + f"as {dst_format!r} at {output_path!r} and available for downloading for the next " + f"{cache_ttl.total_seconds()} seconds. " ) return output_path From 341299364a8a026a5aaa74e9e393003aee25d32b Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Wed, 11 Dec 2024 12:13:38 +0100 Subject: [PATCH 04/61] Add todo comments --- cvat/apps/dataset_manager/views.py | 10 ++++++---- cvat/apps/engine/backup.py | 2 ++ cvat/apps/events/export.py | 1 + 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index e4a267d8738f..84ac3e44e115 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -203,6 +203,7 @@ def clear_export_cache(file_path: str, logger: logging.Logger) -> None: if not osp.exists(file_path): logger.error("Export cache file '{}' doesn't exist".format(file_path)) + # TODO: update for backups parsed_filename = parse_export_file_path(file_path) cache_ttl = get_export_cache_ttl(parsed_filename.instance_type) @@ -213,17 +214,17 @@ def clear_export_cache(file_path: str, logger: logging.Logger) -> None: raise FileIsBeingUsedError os.remove(file_path) - logger.debug("Export cache file '{}' successfully removed".format(file_path)) + logger.debug(f"Export cache file {file_path!r} successfully removed") except LockNotAvailableError: logger.info( - "Failed to acquire export cache lock for the file: {file_path}." + f"Failed to acquire export cache lock for the file: {file_path}." 
) raise except Exception: log_exception(logger) raise - +# todo: move into engine def cron_export_cache_cleanup(path_to_model: str) -> None: assert isinstance(path_to_model, str) @@ -247,8 +248,9 @@ def cron_export_cache_cleanup(path_to_model: str) -> None: continue for child in export_cache_dir_path.iterdir(): + # export cache dir may contain temporary directories if not child.is_file(): - logger.warning(f'Unexpected file found in export cache: {child.relative_to(instance_dir_path)}') + logger.warning(f'The {child.relative_to(instance_dir_path)} is not a file, skipping...') continue with suppress(Exception): diff --git a/cvat/apps/engine/backup.py b/cvat/apps/engine/backup.py index dd9144f2db72..41ff59e52543 100644 --- a/cvat/apps/engine/backup.py +++ b/cvat/apps/engine/backup.py @@ -1029,6 +1029,7 @@ def create_backup(db_instance: models.Project | models.Task, Exporter, output_pa exporter.export_to(temp_file) os.replace(temp_file, output_path) + # TODO: move into cron job archive_ctime = os.path.getctime(output_path) scheduler = django_rq.get_scheduler(settings.CVAT_QUEUES.IMPORT_DATA.value) cleaning_job = scheduler.enqueue_in(time_delta=cache_ttl, @@ -1192,6 +1193,7 @@ def import_task(request, queue_name, filename=None): filename=filename ) +# TODO: delete function def _clear_export_cache(file_path: str, file_ctime: float, logger: Logger) -> None: try: if os.path.exists(file_path) and os.path.getctime(file_path) == file_ctime: diff --git a/cvat/apps/events/export.py b/cvat/apps/events/export.py index 9225f1141162..3955337b6f16 100644 --- a/cvat/apps/events/export.py +++ b/cvat/apps/events/export.py @@ -72,6 +72,7 @@ def _create_csv(query_params, output_filename, cache_ttl): writer.writerows(result.result_rows) archive_ctime = os.path.getctime(output_filename) + # TODO: scheduler = django_rq.get_scheduler(settings.CVAT_QUEUES.EXPORT_DATA.value) cleaning_job = scheduler.enqueue_in(time_delta=cache_ttl, func=_clear_export_cache, From 2cc395c6e61567495ebf5c009a90e0c01d4dfa63 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Tue, 17 Dec 2024 13:19:43 +0100 Subject: [PATCH 05/61] [Backups] clean up export cache from a cron job && use locks duiring working with a file --- .../tests/test_rest_api_formats.py | 2 +- cvat/apps/dataset_manager/util.py | 196 ++++++++++++------ cvat/apps/dataset_manager/views.py | 17 +- cvat/apps/engine/background.py | 56 +++-- cvat/apps/engine/backup.py | 94 ++++++--- 5 files changed, 237 insertions(+), 128 deletions(-) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index f50c1a00d694..92aee78db014 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -36,7 +36,7 @@ from cvat.apps.dataset_manager.task import TaskAnnotation from cvat.apps.dataset_manager.tests.utils import TestDir from cvat.apps.dataset_manager.util import get_export_cache_lock -from cvat.apps.dataset_manager.views import clear_export_cache, export, parse_export_file_path +from cvat.apps.dataset_manager.views import clear_export_cache, export from cvat.apps.engine.models import Task from cvat.apps.engine.tests.utils import get_paginated_collection, ApiTestBase, ForceLogin diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index 0f117ddc1e3e..015b6ffd7a7a 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -12,6 +12,7 @@ from contextlib import contextmanager from 
copy import deepcopy from datetime import timedelta +from enum import Enum from threading import Lock from typing import Any, Optional @@ -39,7 +40,7 @@ def make_zip_archive(src_path, dst_path): def bulk_create(db_model, objects, flt_param): if objects: if flt_param: - if 'postgresql' in settings.DATABASES["default"]["ENGINE"]: + if "postgresql" in settings.DATABASES["default"]["ENGINE"]: return db_model.objects.bulk_create(objects) else: ids = list(db_model.objects.filter(**flt_param).values_list('id', flat=True)) @@ -51,9 +52,11 @@ def bulk_create(db_model, objects, flt_param): return [] + def is_prefetched(queryset: models.QuerySet, field: str) -> bool: return field in queryset._prefetch_related_lookups + def add_prefetch_fields(queryset: models.QuerySet, fields: Sequence[str]) -> models.QuerySet: for field in fields: if not is_prefetched(queryset, field): @@ -61,6 +64,7 @@ def add_prefetch_fields(queryset: models.QuerySet, fields: Sequence[str]) -> mod return queryset + def get_cached(queryset: models.QuerySet, pk: int) -> models.Model: """ Like regular queryset.get(), but checks for the cached values first @@ -80,6 +84,7 @@ def get_cached(queryset: models.QuerySet, pk: int) -> models.Model: return result + def faster_deepcopy(v): "A slightly optimized version of the default deepcopy, can be used as a drop-in replacement." # Default deepcopy is very slow, here we do shallow copy for primitive types and containers @@ -144,77 +149,132 @@ def get_export_cache_lock( lock.release() - -def make_export_filename( - dst_dir: str, - save_images: bool, - instance_timestamp: float, - format_name: str, -) -> str: - from .formats.registry import EXPORT_FORMATS - file_ext = EXPORT_FORMATS[format_name].EXT - - filename = '%s-instance%f-%s.%s' % ( - 'dataset' if save_images else 'annotations', - # store the instance timestamp in the file name to reliably get this information - # ctime / mtime do not return file creation time on linux - # mtime is used for file usage checks - instance_timestamp, - make_file_name(to_snake_case(format_name)), - file_ext, - ) - return osp.join(dst_dir, filename) +class ExportFileType(str, Enum): + ANNOTATIONS = "annotations" + BACKUP = "backup" + DATASET = "dataset" -@attrs.define -class ParsedExportFilename: +@attrs.frozen +class _ParsedExportFilename: + file_type: ExportFileType + file_ext: str instance_type: str - has_images: bool - instance_timestamp: Optional[float] + instance_timestamp: float + + +@attrs.frozen +class ParsedDatasetFilename(_ParsedExportFilename): format_repr: str - file_ext: str -def parse_export_file_path(file_path: os.PathLike[str]) -> ParsedExportFilename: - file_path = osp.normpath(file_path) - dirname, basename = osp.split(file_path) +@attrs.frozen +class ParsedBackupFilename(_ParsedExportFilename): + pass - basename_match = re.fullmatch( - ( - r'(?Pdataset|annotations)' - # optional for backward compatibility - r'(?:-instance(?P\d+\.\d+)-|_)' - r'(?P.+)' - r'\.(?P.+)' - ), - basename - ) - if not basename_match: - raise ValueError(f"Couldn't parse filename components in '{basename}'") - - dirname_match = re.search(rf'/(jobs|tasks|projects)/\d+/{settings.EXPORT_CACHE_DIR_NAME}$', dirname) - if not dirname_match: - raise ValueError(f"Couldn't parse instance type in '{dirname}'") - - match dirname_match.group(1): - case 'jobs': - instance_type_name = 'job' - case 'tasks': - instance_type_name = 'task' - case 'projects': - instance_type_name = 'project' - case _: - assert False - - if instance_timestamp_str := 
basename_match.groupdict().get('instance_timestamp'): - instance_timestamp = float(instance_timestamp_str) - else: - instance_timestamp = None - - return ParsedExportFilename( - instance_type=instance_type_name, - has_images=basename_match.group('export_mode') == 'dataset', - instance_timestamp=instance_timestamp, - format_repr=basename_match.group('format_tag'), - file_ext=basename_match.group('file_ext'), - ) + +class ExportCacheManager: + # store the instance timestamp in the file name to reliably get this information + # ctime / mtime do not return file creation time on linux + # mtime is used for file usage checks + BASE_FILE_NAME_TEMPLATE = "{file_type}-instance{instance_timestamp}{optional_suffix}.{file_ext}" + + @classmethod + def make_dataset_file_path( + cls, + cache_dir: str, + *, + save_images: bool, + instance_timestamp: float, + format_name: str, + ) -> str: + from .formats.registry import EXPORT_FORMATS + + file_ext = EXPORT_FORMATS[format_name].EXT + + file_type = ExportFileType.DATASET if save_images else ExportFileType.ANNOTATIONS + + normalized_format_name = make_file_name(to_snake_case(format_name)) + filename = cls.BASE_FILE_NAME_TEMPLATE.format_map( + { + "file_type": file_type, + "instance_timestamp": instance_timestamp, + "optional_suffix": "-" + normalized_format_name, + "file_ext": file_ext, + } + ) + + return osp.join(cache_dir, filename) + + @classmethod + def make_backup_file_path( + cls, + cache_dir: str, + *, + instance_timestamp: float, + ) -> str: + filename = cls.BASE_FILE_NAME_TEMPLATE.format_map( + { + "file_type": ExportFileType.BACKUP, + "instance_timestamp": instance_timestamp, + "optional_suffix": "", + "file_ext": "zip", + } + ) + return osp.join(cache_dir, filename) + + @staticmethod + def parse_file_path( + file_path: os.PathLike[str], + ) -> ParsedDatasetFilename | ParsedBackupFilename: + file_path = osp.normpath(file_path) + dirname, basename = osp.split(file_path) + + # handle directory + dirname_match = re.search( + rf"/(jobs|tasks|projects)/\d+/{settings.EXPORT_CACHE_DIR_NAME}$", dirname + ) + if not dirname_match: + raise ValueError(f"Couldn't parse instance type in '{dirname}'") + + instance_type_names = dirname_match.group(1) + assert instance_type_names in {"projects", "tasks", "jobs"} + instance_type_name = instance_type_names[:-1] + + # handle file name + file_type, non_parsed_basename = basename.split("-", maxsplit=1) + file_type = ExportFileType(file_type) + + if file_type in (ExportFileType.DATASET, ExportFileType.ANNOTATIONS): + basename_match = re.fullmatch( + ( + # optional for backward compatibility + r"(?:instance(?P\d+\.\d+)-|_)" + r"(?P.+)" # TODO: convert back? 
+ r"\.(?P.+)" + ), + non_parsed_basename, + ) + ParsedFileNameClass = ParsedDatasetFilename + elif file_type == ExportFileType.BACKUP: + basename_match = re.fullmatch( + (r"(?:instance(?P\d+\.\d+)-|_)" r"\.(?P.+)"), + non_parsed_basename, + ) + ParsedFileNameClass = ParsedBackupFilename + else: + raise ValueError(f"Unsupported file type: {file_type!r}") + + if not basename_match: + raise ValueError(f"Couldn't parse filename components in '{basename}'") + + fragments = basename_match.groupdict() + + if fragments.get("instance_timestamp"): + fragments["instance_timestamp"] = float(fragments["instance_timestamp"]) + + return ParsedFileNameClass( + file_type=file_type.value, + instance_type=instance_type_name, + **fragments, + ) diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index f09d1c18cb11..892aca678e65 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -30,8 +30,7 @@ from .util import ( LockNotAvailableError, current_function_name, get_export_cache_lock, - make_export_filename, - parse_export_file_path + ExportCacheManager ) slogger = ServerLogManager(__name__) @@ -64,7 +63,7 @@ def get_export_cache_ttl(db_instance: str | Project | Task | Job) -> timedelta: return TTL_CONSTS[db_instance.lower()] -def _retry_current_rq_job(time_delta: timedelta) -> rq.job.Job: +def retry_current_rq_job(time_delta: timedelta) -> rq.job.Job: # TODO: implement using retries once we move from rq_scheduler to builtin RQ scheduler # for better reliability and error reporting @@ -130,8 +129,9 @@ def export(dst_format, project_id=None, task_id=None, job_id=None, server_url=No )) instance_update_time = max(tasks_update + [instance_update_time]) - output_path = make_export_filename( - cache_dir, save_images, instance_update_time.timestamp(), dst_format + output_path = ExportCacheManager.make_dataset_file_path( + cache_dir, save_images=save_images, instance_timestamp=instance_update_time.timestamp(), + format_name=dst_format ) os.makedirs(cache_dir, exist_ok=True) @@ -158,7 +158,7 @@ def export(dst_format, project_id=None, task_id=None, job_id=None, server_url=No return output_path except LockNotAvailableError: # Need to retry later if the lock was not available - _retry_current_rq_job(EXPORT_LOCKED_RETRY_INTERVAL) + retry_current_rq_job(EXPORT_LOCKED_RETRY_INTERVAL) logger.info( "Failed to acquire export cache lock. 
Retrying in {}".format( EXPORT_LOCKED_RETRY_INTERVAL @@ -204,13 +204,12 @@ def clear_export_cache(file_path: str, logger: logging.Logger) -> None: if not osp.exists(file_path): logger.error("Export cache file '{}' doesn't exist".format(file_path)) - # TODO: update for backups - parsed_filename = parse_export_file_path(file_path) + parsed_filename = ExportCacheManager.parse_file_path(file_path) cache_ttl = get_export_cache_ttl(parsed_filename.instance_type) if timezone.now().timestamp() <= osp.getmtime(file_path) + cache_ttl.total_seconds(): logger.info( - "Export cache file '{}' is recently accessed".format(file_path) + "Cache file '{}' is recently accessed".format(file_path) ) raise FileIsBeingUsedError diff --git a/cvat/apps/engine/background.py b/cvat/apps/engine/background.py index a76ba8646c84..56a7d2d4c939 100644 --- a/cvat/apps/engine/background.py +++ b/cvat/apps/engine/background.py @@ -463,8 +463,6 @@ def setup_background_job( db_storage = None result_url = self.make_result_url() - self.db_instance.touch_last_export_date() - with get_rq_lock_by_user(queue, user_id): queue.enqueue_call( func=func, @@ -478,6 +476,8 @@ def setup_background_job( failure_ttl=cache_ttl.total_seconds(), ) + self.db_instance.touch_last_export_date() + def get_v1_endpoint_view_name(self) -> str: """ Get view name of the endpoint for the first API version @@ -531,6 +531,10 @@ def _handle_rq_job_v1( rq_job: Optional[RQJob], queue: DjangoRQ, ) -> Optional[Response]: + + def is_result_outdated() -> bool: + return rq_job.meta[RQJobMetaField.REQUEST]["timestamp"] < last_instance_update_time + last_instance_update_time = timezone.localtime(self.db_instance.updated_date) timestamp = self.get_timestamp(last_instance_update_time) @@ -590,25 +594,34 @@ def _handle_rq_job_v1( status=status.HTTP_500_INTERNAL_SERVER_ERROR, ) - elif not os.path.exists(file_path): - return Response( - "The export result is not found", - status=status.HTTP_500_INTERNAL_SERVER_ERROR, - ) if action == "download": - filename = self.export_args.filename or build_backup_file_name( - class_name=self.resource, - identifier=self.db_instance.name, - timestamp=timestamp, - extension=os.path.splitext(file_path)[1], - ) - - rq_job.delete() - return sendfile( - self.request, file_path, attachment=True, attachment_filename=filename - ) - - return Response(status=status.HTTP_201_CREATED) + # TODO: update after 8721 + with dm.util.get_export_cache_lock(file_path, ttl=55, acquire_timeout=50): + if not os.path.exists(file_path): + return Response( + "The backup file has been expired, please retry backing up", + status=status.HTTP_404_NOT_FOUND, + ) + + filename = self.export_args.filename or build_backup_file_name( + class_name=self.resource, + identifier=self.db_instance.name, + timestamp=timestamp, + extension=os.path.splitext(file_path)[1], + ) + + rq_job.delete() + return sendfile( + self.request, file_path, attachment=True, attachment_filename=filename + ) + # TODO: update after 8721 + with dm.util.get_export_cache_lock(file_path, ttl=55, acquire_timeout=50): + if osp.exists(file_path) and not is_result_outdated(): + # extend_export_file_lifetime(file_path) + return Response(status=status.HTTP_201_CREATED) + + cancel_and_delete(rq_job) + return None else: raise NotImplementedError( f"Export to {self.export_args.location} location is not implemented yet" @@ -685,7 +698,6 @@ def setup_background_job( func_args = ( self.db_instance, Exporter, - "{}_backup.zip".format(self.resource), logger, cache_ttl, ) @@ -739,6 +751,8 @@ def setup_background_job( 
failure_ttl=cache_ttl.total_seconds(), ) + self.db_instance.touch_last_export_date() + def get_v1_endpoint_view_name(self) -> str: """Get view name of the endpoint for the first API version""" diff --git a/cvat/apps/engine/backup.py b/cvat/apps/engine/backup.py index 41ff59e52543..9089bdba5d62 100644 --- a/cvat/apps/engine/backup.py +++ b/cvat/apps/engine/backup.py @@ -10,11 +10,14 @@ import shutil import tempfile import uuid +from abc import ABCMeta, abstractmethod from enum import Enum from logging import Logger from tempfile import NamedTemporaryFile -from typing import Any, Collection, Dict, Iterable, Optional, Union +from typing import Any, Collection, Dict, Iterable, Optional, Union, Type from zipfile import ZipFile +import logging +from datetime import timedelta import django_rq from django.conf import settings @@ -28,6 +31,8 @@ from rest_framework.exceptions import ValidationError import cvat.apps.dataset_manager as dm +from cvat.apps.dataset_manager.util import ExportCacheManager, get_export_cache_lock, LockNotAvailableError +from cvat.apps.dataset_manager.views import EXPORT_LOCKED_RETRY_INTERVAL, retry_current_rq_job from cvat.apps.engine import models from cvat.apps.engine.log import ServerLogManager from cvat.apps.engine.serializers import (AttributeSerializer, DataSerializer, JobWriteSerializer, @@ -307,7 +312,7 @@ def _get_db_jobs(self): return db_jobs return () -class _ExporterBase(): +class _ExporterBase(metaclass=ABCMeta): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -335,6 +340,10 @@ def _write_directory(self, source_dir, zip_object, target_dir, recursive=True, e target_dir=target_dir, ) + @abstractmethod + def export_to(self, file: str | ZipFile, **kwargs): + ... + class TaskExporter(_ExporterBase, _TaskBackupBase): def __init__(self, pk, version=Version.V1): super().__init__(logger=slogger.task[pk]) @@ -521,7 +530,7 @@ def _export_task(self, zip_obj, target_dir=None): self._write_annotations(zip_obj, target_dir) self._write_annotation_guide(zip_obj, target_dir) - def export_to(self, file, target_dir=None): + def export_to(self, file: str | ZipFile, target_dir: str | None = None): if self._db_task.data.storage_method == StorageMethodChoice.FILE_SYSTEM and \ self._db_task.data.storage == StorageChoice.SHARE: raise Exception('The task cannot be exported because it does not contain any raw data') @@ -933,8 +942,8 @@ def serialize_project(): zip_object.writestr(self.MANIFEST_FILENAME, data=JSONRenderer().render(project)) - def export_to(self, filename): - with ZipFile(filename, 'w') as output_file: + def export_to(self, file: str): + with ZipFile(file, 'w') as output_file: self._write_annotation_guide(output_file) self._write_manifest(output_file) self._write_tasks(output_file) @@ -1015,37 +1024,64 @@ def _import_project(filename, user, org_id): db_project = project_importer.import_project() return db_project.id -def create_backup(db_instance: models.Project | models.Task, Exporter, output_path, logger, cache_ttl): + +def create_backup( + # FUTURE-FIXME: there db_instance_id should be passed + db_instance: models.Project | models.Task, + Exporter: Type[ProjectExporter | TaskExporter], + logger: logging.Logger, + cache_ttl: timedelta, +): try: cache_dir = db_instance.get_export_cache_directory(create=True) - output_path = os.path.join(cache_dir, output_path) - - instance_time = timezone.localtime(db_instance.updated_date).timestamp() - if not (os.path.exists(output_path) and \ - instance_time <= os.path.getmtime(output_path)): - with 
tempfile.TemporaryDirectory(dir=cache_dir) as temp_dir: - temp_file = os.path.join(temp_dir, 'dump') - exporter = Exporter(db_instance.id) - exporter.export_to(temp_file) + db_instance.refresh_from_db(fields=['updated_date']) + instance_timestamp = timezone.localtime(db_instance.updated_date).timestamp() + + output_path = ExportCacheManager.make_backup_file_path(cache_dir, instance_timestamp=instance_timestamp) + + with get_export_cache_lock( + output_path, + block=True, + # TODO: update after merging #8721 (DATASET_CACHE_LOCK_ACQUISITION_TIMEOUT, DATASET_EXPORT_LOCK_TTL) + acquire_timeout=60, + ttl=30, + ): + # output_path includes timestamp of the last update + if os.path.exists(output_path): + # TODO: update after merging #8721 + # extend_export_file_lifetime(output_path) + return output_path + + with tempfile.TemporaryDirectory(dir=cache_dir) as temp_dir: + temp_file = os.path.join(temp_dir, 'dump') + exporter = Exporter(db_instance.id) + exporter.export_to(temp_file) + + with get_export_cache_lock( + output_path, + block=True, + # TODO: update after merging #8721 (DATASET_CACHE_LOCK_ACQUISITION_TIMEOUT, DATASET_EXPORT_LOCK_TTL) + acquire_timeout=60, + ttl=30, + ): os.replace(temp_file, output_path) - # TODO: move into cron job - archive_ctime = os.path.getctime(output_path) - scheduler = django_rq.get_scheduler(settings.CVAT_QUEUES.IMPORT_DATA.value) - cleaning_job = scheduler.enqueue_in(time_delta=cache_ttl, - func=_clear_export_cache, - file_path=output_path, - file_ctime=archive_ctime, - logger=logger) logger.info( - "The {} '{}' is backuped at '{}' " - "and available for downloading for the next {}. " - "Export cache cleaning job is enqueued, id '{}'".format( - "project" if isinstance(db_instance, Project) else 'task', - db_instance.name, output_path, cache_ttl, - cleaning_job.id)) + f"The {db_instance.__class__.__name__.lower()} '{db_instance.id}' is backed up at {output_path!r} " + f"and available for downloading for the next {cache_ttl}." + ) return output_path + except LockNotAvailableError: + # Need to retry later if the lock was not available + retry_current_rq_job(EXPORT_LOCKED_RETRY_INTERVAL) + logger.info( + "Failed to acquire export cache lock. 
Retrying in {}".format( + EXPORT_LOCKED_RETRY_INTERVAL + ) + ) + raise + except Exception: log_exception(logger) raise From 028313fdd50c8ad929da9e48a299a7c25302f4c3 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Tue, 17 Dec 2024 13:23:46 +0100 Subject: [PATCH 06/61] Fix pylint issues --- cvat/apps/dataset_manager/tests/test_rest_api_formats.py | 1 - cvat/apps/engine/backup.py | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index 92aee78db014..bf09578acd1f 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -12,7 +12,6 @@ import av import numpy as np import random -import shutil import xml.etree.ElementTree as ET import zipfile from contextlib import ExitStack, contextmanager diff --git a/cvat/apps/engine/backup.py b/cvat/apps/engine/backup.py index 9089bdba5d62..b6ee6830652f 100644 --- a/cvat/apps/engine/backup.py +++ b/cvat/apps/engine/backup.py @@ -47,7 +47,7 @@ ) from cvat.apps.engine.rq_job_handler import RQId, RQJobMetaField from cvat.apps.engine.models import ( - StorageChoice, StorageMethodChoice, DataChoice, Project, Location, + StorageChoice, StorageMethodChoice, DataChoice, Location, RequestAction, RequestTarget, RequestSubresource, ) from cvat.apps.engine.task import JobFileMapping, _create_thread @@ -341,7 +341,7 @@ def _write_directory(self, source_dir, zip_object, target_dir, recursive=True, e ) @abstractmethod - def export_to(self, file: str | ZipFile, **kwargs): + def export_to(self, file: str | ZipFile, target_dir: str | None = None): ... class TaskExporter(_ExporterBase, _TaskBackupBase): @@ -942,7 +942,7 @@ def serialize_project(): zip_object.writestr(self.MANIFEST_FILENAME, data=JSONRenderer().render(project)) - def export_to(self, file: str): + def export_to(self, file: str, target_dir: str | None = None): with ZipFile(file, 'w') as output_file: self._write_annotation_guide(output_file) self._write_manifest(output_file) From 5292d2902bfcfab4746408ca2b8b855d9feec111 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Tue, 17 Dec 2024 13:57:53 +0100 Subject: [PATCH 07/61] Remove todo --- cvat/apps/events/export.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cvat/apps/events/export.py b/cvat/apps/events/export.py index 3955337b6f16..9225f1141162 100644 --- a/cvat/apps/events/export.py +++ b/cvat/apps/events/export.py @@ -72,7 +72,6 @@ def _create_csv(query_params, output_filename, cache_ttl): writer.writerows(result.result_rows) archive_ctime = os.path.getctime(output_filename) - # TODO: scheduler = django_rq.get_scheduler(settings.CVAT_QUEUES.EXPORT_DATA.value) cleaning_job = scheduler.enqueue_in(time_delta=cache_ttl, func=_clear_export_cache, From b18a008aa6b0cd2d614e78ee437810f57edba99c Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Tue, 17 Dec 2024 13:58:12 +0100 Subject: [PATCH 08/61] typo --- cvat/apps/dataset_manager/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index 015b6ffd7a7a..b7a11a0bed63 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -258,7 +258,7 @@ def parse_file_path( ParsedFileNameClass = ParsedDatasetFilename elif file_type == ExportFileType.BACKUP: basename_match = re.fullmatch( - (r"(?:instance(?P\d+\.\d+)-|_)" r"\.(?P.+)"), + r"(?:instance(?P\d+\.\d+)-|_)" 
r"\.(?P.+)", non_parsed_basename, ) ParsedFileNameClass = ParsedBackupFilename From 8fec24a404b1874b229e0ffaf904c7716bd97c4e Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 20 Dec 2024 11:35:27 +0100 Subject: [PATCH 09/61] Update tests --- .../tests/test_rest_api_formats.py | 112 ++---------------- cvat/apps/dataset_manager/views.py | 3 +- 2 files changed, 11 insertions(+), 104 deletions(-) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index 63b8a435e48a..47437a1a6428 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -1510,16 +1510,11 @@ def _clear(*_, file_path: str): patch( "cvat.apps.dataset_manager.views.os.remove" ) as mock_os_remove, - patch( - "cvat.apps.dataset_manager.views.rq.get_current_job" - ) as mock_rq_get_current_job, - patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), patch( "cvat.apps.dataset_manager.views.TTL_CONSTS", new={"task": export_outdated_after}, ), ): - mock_rq_get_current_job.return_value = MagicMock(timeout=5) mock_os_remove.side_effect = chain_side_effects( original_remove, side_effect(set_condition, clear_removed_the_file), @@ -1560,20 +1555,13 @@ def _clear(*_, file_path: str): task = self._setup_task_with_annotations(format_name=format_name) task_id = task["id"] - with ( - patch("cvat.apps.dataset_manager.views.rq.get_current_job") as mock_rq_get_current_job, - patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), - ): - mock_rq_job = MagicMock(timeout=5) - mock_rq_get_current_job.return_value = mock_rq_job - - # create a file in the export cache - first_export_path = export(dst_format=format_name, task_id=task_id) + # create a file in the export cache + first_export_path = export(dst_format=format_name, task_id=task_id) - initial_file_modfication_time = os.path.getmtime(first_export_path) - # make sure that a file in the export cache is outdated by timeout - # and a file would have to be deleted if the export was not running in parallel - sleep(export_outdated_after.seconds + 1) + initial_file_modfication_time = os.path.getmtime(first_export_path) + # make sure that a file in the export cache is outdated by timeout + # and a file would have to be deleted if the export was not running in parallel + sleep(export_outdated_after.seconds + 1) processes_finished_correctly = False with ExitStack() as es: @@ -1704,10 +1692,6 @@ def _clear(*_, file_path: str): new=self.patched_get_export_cache_lock, ), patch("cvat.apps.dataset_manager.views.os.remove") as mock_os_remove, - patch( - "cvat.apps.dataset_manager.views.rq.get_current_job" - ) as mock_rq_get_current_job, - patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), patch( "cvat.apps.dataset_manager.views.TTL_CONSTS", new={"task": timedelta(seconds=0)} ), @@ -1717,8 +1701,6 @@ def _clear(*_, file_path: str): side_effect(set_condition, clear_removed_the_file), ) - mock_rq_get_current_job.return_value = MagicMock(timeout=5) - exited_by_timeout = False try: clear_export_cache( @@ -1828,12 +1810,8 @@ def test_export_can_create_file(self): task_id = task["id"] with ( - patch("cvat.apps.dataset_manager.views.rq.get_current_job") as mock_rq_get_current_job, patch("cvat.apps.dataset_manager.views.TTL_CONSTS", new={"task": timedelta(seconds=0)}), ): - mock_rq_job = MagicMock(timeout=5) - mock_rq_get_current_job.return_value = mock_rq_job - export_path = export(dst_format=format_name, task_id=task_id) 
self.assertTrue(osp.isfile(export_path)) @@ -1875,26 +1853,16 @@ def test_export_can_reuse_older_file_if_still_relevant(self): task = self._setup_task_with_annotations(format_name=format_name) task_id = task["id"] - with ( - patch("cvat.apps.dataset_manager.views.rq.get_current_job") as mock_rq_get_current_job, - patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), - ): - mock_rq_get_current_job.return_value = MagicMock(timeout=5) - - first_export_path = export(dst_format=format_name, task_id=task_id) + first_export_path = export(dst_format=format_name, task_id=task_id) from os.path import exists as original_exists with ( - patch("cvat.apps.dataset_manager.views.rq.get_current_job") as mock_rq_get_current_job, - patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), patch( "cvat.apps.dataset_manager.views.osp_exists", side_effect=original_exists ) as mock_osp_exists, patch("cvat.apps.dataset_manager.views.os.replace") as mock_os_replace, ): - mock_rq_get_current_job.return_value = MagicMock(timeout=5) - second_export_path = export(dst_format=format_name, task_id=task_id) self.assertEqual(first_export_path, second_export_path) @@ -2059,68 +2027,19 @@ def test_cleanup_can_remove_file(self): task = self._setup_task_with_annotations(format_name=format_name) task_id = task["id"] - with ( - patch("cvat.apps.dataset_manager.views.rq.get_current_job") as mock_rq_get_current_job, - patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), - ): - mock_rq_get_current_job.return_value = MagicMock(timeout=5) - - export_path = export(dst_format=format_name, task_id=task_id) + export_path = export(dst_format=format_name, task_id=task_id) with ( - patch("cvat.apps.dataset_manager.views.rq.get_current_job") as mock_rq_get_current_job, - patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), patch("cvat.apps.dataset_manager.views.TTL_CONSTS", new={"task": timedelta(seconds=0)}), ): - mock_rq_get_current_job.return_value = MagicMock(timeout=5) - export_path = export(dst_format=format_name, task_id=task_id) clear_export_cache(file_path=export_path, logger=MagicMock()) self.assertFalse(osp.isfile(export_path)) - def test_cleanup_can_request_retry_on_locking_failure(self): - format_name = "CVAT for images 1.1" - task = self._setup_task_with_annotations(format_name=format_name) - task_id = task["id"] - - from cvat.apps.dataset_manager.util import LockNotAvailableError - - with ( - patch("cvat.apps.dataset_manager.views.rq.get_current_job") as mock_rq_get_current_job, - patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), - ): - mock_rq_get_current_job.return_value = MagicMock(timeout=5) - - export_path = export(dst_format=format_name, task_id=task_id) - - with ( - patch( - "cvat.apps.dataset_manager.views.get_export_cache_lock", - side_effect=LockNotAvailableError, - ) as mock_get_export_cache_lock, - patch("cvat.apps.dataset_manager.views.rq.get_current_job") as mock_rq_get_current_job, - patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), - self.assertRaises(LockNotAvailableError), - ): - mock_rq_job = MagicMock(timeout=5) - mock_rq_get_current_job.return_value = mock_rq_job - - clear_export_cache(file_path=export_path, logger=MagicMock()) - - mock_get_export_cache_lock.assert_called() - self.assertEqual(mock_rq_job.retries_left, 1) - self.assertTrue(osp.isfile(export_path)) def test_cleanup_can_fail_if_no_file(self): - with ( - patch("cvat.apps.dataset_manager.views.rq.get_current_job") as mock_rq_get_current_job, - 
patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), - self.assertRaises(FileNotFoundError), - ): - mock_rq_job = MagicMock(timeout=5) - mock_rq_get_current_job.return_value = mock_rq_job - + with self.assertRaises(FileNotFoundError): clear_export_cache(file_path="non existent file path", logger=MagicMock()) def test_cleanup_can_defer_removal_if_file_is_used_recently(self): @@ -2128,28 +2047,17 @@ def test_cleanup_can_defer_removal_if_file_is_used_recently(self): task = self._setup_task_with_annotations(format_name=format_name) task_id = task["id"] - with ( - patch("cvat.apps.dataset_manager.views.rq.get_current_job") as mock_rq_get_current_job, - patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), - ): - mock_rq_get_current_job.return_value = MagicMock(timeout=5) - - export_path = export(dst_format=format_name, task_id=task_id) + export_path = export(dst_format=format_name, task_id=task_id) from cvat.apps.dataset_manager.views import FileIsBeingUsedError with ( - patch("cvat.apps.dataset_manager.views.rq.get_current_job") as mock_rq_get_current_job, patch("cvat.apps.dataset_manager.views.TTL_CONSTS", new={"task": timedelta(hours=1)}), self.assertRaises(FileIsBeingUsedError), ): - mock_rq_job = MagicMock(timeout=5) - mock_rq_get_current_job.return_value = mock_rq_job - export_path = export(dst_format=format_name, task_id=task_id) clear_export_cache(file_path=export_path, logger=MagicMock()) - self.assertEqual(mock_rq_job.retries_left, 1) self.assertTrue(osp.isfile(export_path)) class ProjectDumpUpload(_DbTestBase): diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index 6316eefc512d..1df6384ec8b8 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -225,7 +225,6 @@ class FileIsBeingUsedError(Exception): # TODO: write a migration to delete all clear_export_cache scheduled jobs from scheduler def clear_export_cache(file_path: str, logger: logging.Logger) -> None: try: - # TODO: update after 8721 with get_export_cache_lock( file_path, block=True, @@ -233,7 +232,7 @@ def clear_export_cache(file_path: str, logger: logging.Logger) -> None: ttl=EXPORT_CACHE_LOCK_TTL, ): if not osp.exists(file_path): - logger.error("Export cache file '{}' doesn't exist".format(file_path)) + raise FileNotFoundError(f"Export cache file {file_path} doesn't exist") parsed_filename = ExportCacheManager.parse_file_path(file_path) cache_ttl = get_export_cache_ttl(parsed_filename.instance_type) From 81ec1fc842b7ac19c479e511e5d108c0af3cc08f Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 20 Dec 2024 11:44:57 +0100 Subject: [PATCH 10/61] Update after merging #8721 --- cvat/apps/engine/background.py | 11 ++++--- cvat/apps/engine/backup.py | 28 +++++------------- cvat/apps/engine/tests/test_rest_api.py | 39 +++++++++++++------------ 3 files changed, 32 insertions(+), 46 deletions(-) diff --git a/cvat/apps/engine/background.py b/cvat/apps/engine/background.py index 97299bfa7e8f..41116b7c3a06 100644 --- a/cvat/apps/engine/background.py +++ b/cvat/apps/engine/background.py @@ -49,6 +49,7 @@ sendfile, ) from cvat.apps.events.handlers import handle_dataset_export +from cvat.apps.dataset_manager.util import extend_export_file_lifetime slogger = ServerLogManager(__name__) @@ -330,7 +331,7 @@ def handle_local_download() -> Response: acquire_timeout=LOCK_ACQUIRE_TIMEOUT, ): if osp.exists(file_path) and not is_result_outdated(): - dm.util.extend_export_file_lifetime(file_path) + extend_export_file_lifetime(file_path) return 
Response(status=status.HTTP_201_CREATED) @@ -611,8 +612,7 @@ def is_result_outdated() -> bool: ) if action == "download": - # TODO: update after 8721 - with dm.util.get_export_cache_lock(file_path, ttl=55, acquire_timeout=50): + with dm.util.get_export_cache_lock(file_path, ttl=LOCK_TTL, acquire_timeout=LOCK_ACQUIRE_TIMEOUT): if not os.path.exists(file_path): return Response( "The backup file has been expired, please retry backing up", @@ -630,10 +630,9 @@ def is_result_outdated() -> bool: return sendfile( self.request, file_path, attachment=True, attachment_filename=filename ) - # TODO: update after 8721 - with dm.util.get_export_cache_lock(file_path, ttl=55, acquire_timeout=50): + with dm.util.get_export_cache_lock(file_path, ttl=LOCK_TTL, acquire_timeout=LOCK_ACQUIRE_TIMEOUT): if osp.exists(file_path) and not is_result_outdated(): - # extend_export_file_lifetime(file_path) + extend_export_file_lifetime(file_path) return Response(status=status.HTTP_201_CREATED) cancel_and_delete(rq_job) diff --git a/cvat/apps/engine/backup.py b/cvat/apps/engine/backup.py index b6ee6830652f..242db4a52c6c 100644 --- a/cvat/apps/engine/backup.py +++ b/cvat/apps/engine/backup.py @@ -56,6 +56,8 @@ from cvat.apps.engine.permissions import get_cloud_storage_for_import_or_export from cvat.apps.dataset_manager.views import log_exception from cvat.apps.dataset_manager.bindings import CvatImportError +from cvat.apps.dataset_manager.views import EXPORT_CACHE_LOCK_TTL, EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT +from cvat.apps.dataset_manager.util import extend_export_file_lifetime slogger = ServerLogManager(__name__) @@ -1042,14 +1044,12 @@ def create_backup( with get_export_cache_lock( output_path, block=True, - # TODO: update after merging #8721 (DATASET_CACHE_LOCK_ACQUISITION_TIMEOUT, DATASET_EXPORT_LOCK_TTL) - acquire_timeout=60, - ttl=30, + acquire_timeout=EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT, + ttl=EXPORT_CACHE_LOCK_TTL, ): # output_path includes timestamp of the last update if os.path.exists(output_path): - # TODO: update after merging #8721 - # extend_export_file_lifetime(output_path) + extend_export_file_lifetime(output_path) return output_path with tempfile.TemporaryDirectory(dir=cache_dir) as temp_dir: @@ -1060,9 +1060,8 @@ def create_backup( with get_export_cache_lock( output_path, block=True, - # TODO: update after merging #8721 (DATASET_CACHE_LOCK_ACQUISITION_TIMEOUT, DATASET_EXPORT_LOCK_TTL) - acquire_timeout=60, - ttl=30, + acquire_timeout=EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT, + ttl=EXPORT_CACHE_LOCK_TTL, ): os.replace(temp_file, output_path) @@ -1228,16 +1227,3 @@ def import_task(request, queue_name, filename=None): location_conf=location_conf, filename=filename ) - -# TODO: delete function -def _clear_export_cache(file_path: str, file_ctime: float, logger: Logger) -> None: - try: - if os.path.exists(file_path) and os.path.getctime(file_path) == file_ctime: - os.remove(file_path) - - logger.info( - "Export cache file '{}' successfully removed" \ - .format(file_path)) - except Exception: - log_exception(logger) - raise diff --git a/cvat/apps/engine/tests/test_rest_api.py b/cvat/apps/engine/tests/test_rest_api.py index e6ed6b6c0303..91ea882a2a3d 100644 --- a/cvat/apps/engine/tests/test_rest_api.py +++ b/cvat/apps/engine/tests/test_rest_api.py @@ -3088,31 +3088,32 @@ def test_api_v2_tasks_id_export_somebody(self): def test_api_v2_tasks_id_export_no_auth(self): self._run_api_v2_tasks_id_export_import(None) - def test_can_remove_export_cache_automatically_after_successful_export(self): - 
self._create_tasks() - task_id = self.tasks[0]["id"] - user = self.admin + # TODO: add another test that checks running cron job + # def test_can_remove_export_cache_automatically_after_successful_export(self): + # self._create_tasks() + # task_id = self.tasks[0]["id"] + # user = self.admin - with mock.patch('cvat.apps.dataset_manager.views.TASK_CACHE_TTL', new=timedelta(hours=10)): - response = self._run_api_v2_tasks_id_export(task_id, user) - self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED) + # with mock.patch('cvat.apps.dataset_manager.views.TASK_CACHE_TTL', new=timedelta(hours=10)): + # response = self._run_api_v2_tasks_id_export(task_id, user) + # self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED) - response = self._run_api_v2_tasks_id_export(task_id, user) - self.assertEqual(response.status_code, status.HTTP_201_CREATED) + # response = self._run_api_v2_tasks_id_export(task_id, user) + # self.assertEqual(response.status_code, status.HTTP_201_CREATED) - scheduler = django_rq.get_scheduler(settings.CVAT_QUEUES.IMPORT_DATA.value) - scheduled_jobs = list(scheduler.get_jobs()) - cleanup_job = next( - j for j in scheduled_jobs if j.func_name.endswith('.engine.backup._clear_export_cache') - ) + # scheduler = django_rq.get_scheduler(settings.CVAT_QUEUES.IMPORT_DATA.value) + # scheduled_jobs = list(scheduler.get_jobs()) + # cleanup_job = next( + # j for j in scheduled_jobs if j.func_name.endswith('.engine.backup._clear_export_cache') + # ) - export_path = cleanup_job.kwargs['file_path'] - self.assertTrue(os.path.isfile(export_path)) + # export_path = cleanup_job.kwargs['file_path'] + # self.assertTrue(os.path.isfile(export_path)) - from cvat.apps.engine.backup import _clear_export_cache - _clear_export_cache(**cleanup_job.kwargs) + # from cvat.apps.engine.backup import _clear_export_cache + # _clear_export_cache(**cleanup_job.kwargs) - self.assertFalse(os.path.isfile(export_path)) + # self.assertFalse(os.path.isfile(export_path)) def generate_random_image_file(filename): From 0a6dbdc58293cd4daf92924ffad7bf8d331e6c96 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 20 Dec 2024 12:08:11 +0100 Subject: [PATCH 11/61] Move touch_last_export_date() call into worker --- cvat/apps/dataset_manager/views.py | 2 ++ cvat/apps/engine/background.py | 4 ---- cvat/apps/engine/backup.py | 4 ++-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index 1df6384ec8b8..a8e6af598034 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -135,6 +135,8 @@ def export( export_fn = task.export_job db_instance = Job.objects.get(pk=job_id) + db_instance.touch_last_export_date() + cache_ttl = get_export_cache_ttl(db_instance) cache_dir = db_instance.get_export_cache_directory(create=True) diff --git a/cvat/apps/engine/background.py b/cvat/apps/engine/background.py index 41116b7c3a06..23b5aa838b3c 100644 --- a/cvat/apps/engine/background.py +++ b/cvat/apps/engine/background.py @@ -493,8 +493,6 @@ def setup_background_job( failure_ttl=cache_ttl.total_seconds(), ) - self.db_instance.touch_last_export_date() - def get_v1_endpoint_view_name(self) -> str: """ Get view name of the endpoint for the first API version @@ -766,8 +764,6 @@ def setup_background_job( failure_ttl=cache_ttl.total_seconds(), ) - self.db_instance.touch_last_export_date() - def get_v1_endpoint_view_name(self) -> str: """Get view name of the endpoint for the first API version""" diff --git 
a/cvat/apps/engine/backup.py b/cvat/apps/engine/backup.py index 242db4a52c6c..62786195a640 100644 --- a/cvat/apps/engine/backup.py +++ b/cvat/apps/engine/backup.py @@ -16,7 +16,6 @@ from tempfile import NamedTemporaryFile from typing import Any, Collection, Dict, Iterable, Optional, Union, Type from zipfile import ZipFile -import logging from datetime import timedelta import django_rq @@ -1031,11 +1030,12 @@ def create_backup( # FUTURE-FIXME: there db_instance_id should be passed db_instance: models.Project | models.Task, Exporter: Type[ProjectExporter | TaskExporter], - logger: logging.Logger, + logger: Logger, cache_ttl: timedelta, ): try: cache_dir = db_instance.get_export_cache_directory(create=True) + db_instance.touch_last_export_date() db_instance.refresh_from_db(fields=['updated_date']) instance_timestamp = timezone.localtime(db_instance.updated_date).timestamp() From 529b5c6d2cc68cacbb2b9a32e273001de42b473e Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 20 Dec 2024 13:45:13 +0100 Subject: [PATCH 12/61] [Unit tests] Check that cron job deletes files from cache --- .../tests/test_rest_api_formats.py | 46 +++++++++++++++++++ cvat/apps/engine/background.py | 10 ++-- cvat/apps/engine/tests/test_rest_api.py | 2 - 3 files changed, 53 insertions(+), 5 deletions(-) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index 47437a1a6428..71363c700e56 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -2060,6 +2060,52 @@ def test_cleanup_can_defer_removal_if_file_is_used_recently(self): self.assertTrue(osp.isfile(export_path)) + def test_cleanup_cron_job_can_delete_cached_files(self): + from cvat.apps.dataset_manager.views import cron_export_cache_cleanup + + def _get_project_task_job_ids(): + project = self._create_project(projects["main"]) + project_id = project["id"] + + images = self._generate_task_images(3) + task = self._create_task( + data=tasks["task in project #1"], + image_data=images, + ) + task_id = task["id"] + job_id = self._get_jobs(task_id)[0]["id"] + return project_id, task_id, job_id + + # remove chunks from the cache + self._clear_temp_data() + project_id, task_id, job_id = _get_project_task_job_ids() + + for resource, rid in zip(("project", "task", "job"), (project_id, task_id, job_id)): + for save_images in (True, False): + export_path = export( + dst_format="CVAT for images 1.1", + save_images=save_images, + **{resource + "_id": rid}, + ) + self.assertTrue(osp.isfile(export_path)) + self.assertTrue(resource in export_path) + + with ( + patch( + "cvat.apps.dataset_manager.views.TTL_CONSTS", + new={resource: timedelta(seconds=0)}, + ), + patch( + "cvat.apps.dataset_manager.views.clear_export_cache", + side_effect=clear_export_cache, + ) as mock_clear_export_cache, + ): + cron_export_cache_cleanup(f"cvat.apps.engine.models.{resource.title()}") + mock_clear_export_cache.assert_called_once() + + self.assertFalse(osp.exists(export_path)) + + class ProjectDumpUpload(_DbTestBase): def _get_download_project_dataset_response(self, url, user, dump_format_name, edata): data = { diff --git a/cvat/apps/engine/background.py b/cvat/apps/engine/background.py index 23b5aa838b3c..ba20d28774ce 100644 --- a/cvat/apps/engine/background.py +++ b/cvat/apps/engine/background.py @@ -23,6 +23,7 @@ from rq.job import JobStatus as RQJobStatus import cvat.apps.dataset_manager as dm +from cvat.apps.dataset_manager.util import 
extend_export_file_lifetime from cvat.apps.engine import models from cvat.apps.engine.backup import ProjectExporter, TaskExporter, create_backup from cvat.apps.engine.cloud_provider import export_resource_to_cloud_storage @@ -49,7 +50,6 @@ sendfile, ) from cvat.apps.events.handlers import handle_dataset_export -from cvat.apps.dataset_manager.util import extend_export_file_lifetime slogger = ServerLogManager(__name__) @@ -610,7 +610,9 @@ def is_result_outdated() -> bool: ) if action == "download": - with dm.util.get_export_cache_lock(file_path, ttl=LOCK_TTL, acquire_timeout=LOCK_ACQUIRE_TIMEOUT): + with dm.util.get_export_cache_lock( + file_path, ttl=LOCK_TTL, acquire_timeout=LOCK_ACQUIRE_TIMEOUT + ): if not os.path.exists(file_path): return Response( "The backup file has been expired, please retry backing up", @@ -628,7 +630,9 @@ def is_result_outdated() -> bool: return sendfile( self.request, file_path, attachment=True, attachment_filename=filename ) - with dm.util.get_export_cache_lock(file_path, ttl=LOCK_TTL, acquire_timeout=LOCK_ACQUIRE_TIMEOUT): + with dm.util.get_export_cache_lock( + file_path, ttl=LOCK_TTL, acquire_timeout=LOCK_ACQUIRE_TIMEOUT + ): if osp.exists(file_path) and not is_result_outdated(): extend_export_file_lifetime(file_path) return Response(status=status.HTTP_201_CREATED) diff --git a/cvat/apps/engine/tests/test_rest_api.py b/cvat/apps/engine/tests/test_rest_api.py index 91ea882a2a3d..88fa4cbdac36 100644 --- a/cvat/apps/engine/tests/test_rest_api.py +++ b/cvat/apps/engine/tests/test_rest_api.py @@ -4,7 +4,6 @@ # SPDX-License-Identifier: MIT from contextlib import ExitStack -from datetime import timedelta import io from itertools import product import os @@ -25,7 +24,6 @@ import json import av -import django_rq import numpy as np from pdf2image import convert_from_bytes from pyunpack import Archive From 32ef451beddaa43925fddca25486f5329a232cee Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 20 Dec 2024 15:16:42 +0100 Subject: [PATCH 13/61] Move functions into separate engine module --- .../tests/test_rest_api_formats.py | 23 ++-- cvat/apps/dataset_manager/views.py | 79 ------------- cvat/apps/engine/cron.py | 104 ++++++++++++++++++ 3 files changed, 116 insertions(+), 90 deletions(-) create mode 100644 cvat/apps/engine/cron.py diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index 71363c700e56..008de5f52b59 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -35,9 +35,10 @@ from cvat.apps.dataset_manager.task import TaskAnnotation from cvat.apps.dataset_manager.tests.utils import TestDir from cvat.apps.dataset_manager.util import get_export_cache_lock -from cvat.apps.dataset_manager.views import clear_export_cache, export +from cvat.apps.dataset_manager.views import export from cvat.apps.engine.models import Task from cvat.apps.engine.tests.utils import get_paginated_collection, ApiTestBase, ForceLogin +from cvat.apps.engine.cron import clear_export_cache projects_path = osp.join(osp.dirname(__file__), 'assets', 'projects.json') with open(projects_path) as file: @@ -1498,13 +1499,13 @@ def patched_log_exception(logger=None, exc_info=True): def _clear(*_, file_path: str): from os import remove as original_remove - from cvat.apps.dataset_manager.views import FileIsBeingUsedError + from cvat.apps.engine.cron import FileIsBeingUsedError with ( - 
patch("cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_TTL", new=EXPORT_CACHE_LOCK_TTL), - patch("cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", new=EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT), + patch("cvat.apps.engine.cron.EXPORT_CACHE_LOCK_TTL", new=EXPORT_CACHE_LOCK_TTL), + patch("cvat.apps.engine.cron.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", new=EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT), patch( - "cvat.apps.dataset_manager.views.get_export_cache_lock", + "cvat.apps.engine.cron.get_export_cache_lock", new=self.patched_get_export_cache_lock, ), patch( @@ -1686,12 +1687,12 @@ def _clear(*_, file_path: str): from cvat.apps.dataset_manager.util import LockNotAvailableError with ( - patch("cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", new=3), + patch("cvat.apps.engine.cron.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", new=3), patch( - "cvat.apps.dataset_manager.views.get_export_cache_lock", + "cvat.apps.engine.cron.get_export_cache_lock", new=self.patched_get_export_cache_lock, ), - patch("cvat.apps.dataset_manager.views.os.remove") as mock_os_remove, + patch("cvat.apps.engine.cron.os.remove") as mock_os_remove, patch( "cvat.apps.dataset_manager.views.TTL_CONSTS", new={"task": timedelta(seconds=0)} ), @@ -2049,7 +2050,7 @@ def test_cleanup_can_defer_removal_if_file_is_used_recently(self): export_path = export(dst_format=format_name, task_id=task_id) - from cvat.apps.dataset_manager.views import FileIsBeingUsedError + from cvat.apps.engine.cron import FileIsBeingUsedError with ( patch("cvat.apps.dataset_manager.views.TTL_CONSTS", new={"task": timedelta(hours=1)}), @@ -2061,7 +2062,7 @@ def test_cleanup_can_defer_removal_if_file_is_used_recently(self): self.assertTrue(osp.isfile(export_path)) def test_cleanup_cron_job_can_delete_cached_files(self): - from cvat.apps.dataset_manager.views import cron_export_cache_cleanup + from cvat.apps.engine.cron import cron_export_cache_cleanup def _get_project_task_job_ids(): project = self._create_project(projects["main"]) @@ -2096,7 +2097,7 @@ def _get_project_task_job_ids(): new={resource: timedelta(seconds=0)}, ), patch( - "cvat.apps.dataset_manager.views.clear_export_cache", + "cvat.apps.engine.cron.clear_export_cache", side_effect=clear_export_cache, ) as mock_clear_export_cache, ): diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index a8e6af598034..1c6a56dff8cb 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -9,15 +9,12 @@ import tempfile from datetime import timedelta -import importlib import django_rq import rq from os.path import exists as osp_exists from django.conf import settings from django.utils import timezone from rq_scheduler import Scheduler -from pathlib import Path -from contextlib import suppress import cvat.apps.dataset_manager.project as project import cvat.apps.dataset_manager.task as task @@ -26,7 +23,6 @@ from cvat.apps.engine.utils import get_rq_lock_by_user from cvat.apps.engine.rq_job_handler import RQMeta -from django.db.models import QuerySet from .formats.registry import EXPORT_FORMATS, IMPORT_FORMATS from .util import ( LockNotAvailableError, @@ -220,81 +216,6 @@ def export_project_as_dataset(project_id: int, dst_format: str, *, server_url: s def export_project_annotations(project_id: int, dst_format: str, *, server_url: str | None = None): return export(dst_format=dst_format, project_id=project_id, server_url=server_url, save_images=False) - -class FileIsBeingUsedError(Exception): - pass - -# TODO: write 
a migration to delete all clear_export_cache scheduled jobs from scheduler -def clear_export_cache(file_path: str, logger: logging.Logger) -> None: - try: - with get_export_cache_lock( - file_path, - block=True, - acquire_timeout=EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT, - ttl=EXPORT_CACHE_LOCK_TTL, - ): - if not osp.exists(file_path): - raise FileNotFoundError(f"Export cache file {file_path} doesn't exist") - - parsed_filename = ExportCacheManager.parse_file_path(file_path) - cache_ttl = get_export_cache_ttl(parsed_filename.instance_type) - - if timezone.now().timestamp() <= osp.getmtime(file_path) + cache_ttl.total_seconds(): - logger.info( - "Cache file '{}' is recently accessed".format(file_path) - ) - raise FileIsBeingUsedError - - os.remove(file_path) - logger.debug(f"Export cache file {file_path!r} successfully removed") - except LockNotAvailableError: - logger.info( - f"Failed to acquire export cache lock for the file: {file_path}." - ) - raise - except Exception: - log_exception(logger) - raise - -# todo: move into engine -def cron_export_cache_cleanup(path_to_model: str) -> None: - assert isinstance(path_to_model, str) - - started_at = timezone.now() - module_name, model_name = path_to_model.rsplit('.', 1) - module = importlib.import_module(module_name) - ModelClass = getattr(module, model_name) - assert ModelClass in (Project, Task, Job) - - logger = ServerLogManager(__name__).glob - - one_month_ago = timezone.now() - timedelta(days=30) - queryset: QuerySet[Project | Task | Job] = ModelClass.objects.filter(last_export_date__gte=one_month_ago) - - for instance in queryset.iterator(): - instance_dir_path = Path(instance.get_dirname()) - export_cache_dir_path = Path(instance.get_export_cache_directory()) - - if not export_cache_dir_path.exists(): - logger.debug(f"The {export_cache_dir_path.relative_to(instance_dir_path)} path does not exist, skipping...") - continue - - for child in export_cache_dir_path.iterdir(): - # export cache dir may contain temporary directories - if not child.is_file(): - logger.warning(f'The {child.relative_to(instance_dir_path)} is not a file, skipping...') - continue - - with suppress(Exception): - clear_export_cache(child, logger) - - finished_at = timezone.now() - logger.info( - f"Clearing the {model_name}'s export cache has been successfully " - f"completed after {(finished_at - started_at).total_seconds()} seconds..." 
- ) - - def get_export_formats(): return list(EXPORT_FORMATS.values()) diff --git a/cvat/apps/engine/cron.py b/cvat/apps/engine/cron.py new file mode 100644 index 000000000000..5f6fad61bb96 --- /dev/null +++ b/cvat/apps/engine/cron.py @@ -0,0 +1,104 @@ +# Copyright (C) 2024 CVAT.ai Corporation +# +# SPDX-License-Identifier: MIT + +import importlib +import logging +import os +import os.path as osp +from contextlib import suppress +from datetime import timedelta +from pathlib import Path + +from django.db.models import QuerySet +from django.utils import timezone + +from cvat.apps.dataset_manager.util import ( + ExportCacheManager, + LockNotAvailableError, + get_export_cache_lock, +) +from cvat.apps.dataset_manager.views import ( + EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT, + EXPORT_CACHE_LOCK_TTL, + get_export_cache_ttl, + log_exception, +) +from cvat.apps.engine.log import ServerLogManager +from cvat.apps.engine.models import Job, Project, Task + + +class FileIsBeingUsedError(Exception): + pass + + +def clear_export_cache(file_path: str, logger: logging.Logger) -> None: + try: + with get_export_cache_lock( + file_path, + block=True, + acquire_timeout=EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT, + ttl=EXPORT_CACHE_LOCK_TTL, + ): + if not osp.exists(file_path): + raise FileNotFoundError(f"Export cache file {file_path} doesn't exist") + + parsed_filename = ExportCacheManager.parse_file_path(file_path) + cache_ttl = get_export_cache_ttl(parsed_filename.instance_type) + + if timezone.now().timestamp() <= osp.getmtime(file_path) + cache_ttl.total_seconds(): + logger.info("Cache file '{}' is recently accessed".format(file_path)) + raise FileIsBeingUsedError + + os.remove(file_path) + logger.debug(f"Export cache file {file_path!r} successfully removed") + except LockNotAvailableError: + logger.info(f"Failed to acquire export cache lock for the file: {file_path}.") + raise + except Exception: + log_exception(logger) + raise + + +def cron_export_cache_cleanup(path_to_model: str) -> None: + assert isinstance(path_to_model, str) + + started_at = timezone.now() + module_name, model_name = path_to_model.rsplit(".", 1) + module = importlib.import_module(module_name) + ModelClass = getattr(module, model_name) + assert ModelClass in (Project, Task, Job) + + logger = ServerLogManager(__name__).glob + + one_month_ago = timezone.now() - timedelta(days=30) + queryset: QuerySet[Project | Task | Job] = ModelClass.objects.filter( + last_export_date__gte=one_month_ago + ) + + for instance in queryset.iterator(): + instance_dir_path = Path(instance.get_dirname()) + export_cache_dir_path = Path(instance.get_export_cache_directory()) + + if not export_cache_dir_path.exists(): + logger.debug( + f"The {export_cache_dir_path.relative_to(instance_dir_path)} path does not exist, skipping..." + ) + continue + + for child in export_cache_dir_path.iterdir(): + # export cache dir may contain temporary directories + if not child.is_file(): + logger.debug( + f"The {child.relative_to(instance_dir_path)} is not a file, skipping..." + ) + continue + + with suppress(Exception): + clear_export_cache(child, logger) + + finished_at = timezone.now() + logger.info( + f"Clearing the {model_name.lower()}'s export cache has been successfully " + f"completed after {int((finished_at - started_at).total_seconds())} seconds..." 
+ ) From 504cd809f6f61e9fe0aabc8c2f406e6dc50fecda Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 20 Dec 2024 15:17:46 +0100 Subject: [PATCH 14/61] Run project|task|job export cache cleaning at different times --- cvat/settings/base.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cvat/settings/base.py b/cvat/settings/base.py index 84fd487d2dfc..08bd5c6d666d 100644 --- a/cvat/settings/base.py +++ b/cvat/settings/base.py @@ -357,13 +357,15 @@ class CVAT_QUEUES(Enum): { 'queue': CVAT_QUEUES.CLEANING.value, 'id': f'cron_{model.lower()}_export_cache_cleanup', - 'func': 'cvat.apps.dataset_manager.views.cron_export_cache_cleanup', + 'func': 'cvat.apps.engine.cron.cron_export_cache_cleanup', # Run once a day at midnight - 'cron_string': '0 0 * * *', - # 'cron_string': '05 17 * * *', + 'cron_string': cron_string, 'args': (f'cvat.apps.engine.models.{model.title()}',), } - for model in ('project', 'task', 'job') + for model, cron_string in zip( + ('project', 'task', 'job'), + ('0 0 * * *', '0 6 * * *', '0 12 * * *') + ) ), ] From f505335c650fdf88e35a7f4d25da29679eadbbcb Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 20 Dec 2024 15:18:45 +0100 Subject: [PATCH 15/61] Add new module to dev/format_python_code.sh --- dev/format_python_code.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/format_python_code.sh b/dev/format_python_code.sh index db18ce328dc4..81f2dbeb64ca 100755 --- a/dev/format_python_code.sh +++ b/dev/format_python_code.sh @@ -27,6 +27,7 @@ for paths in \ "cvat/apps/engine/background.py" \ "cvat/apps/engine/frame_provider.py" \ "cvat/apps/engine/cache.py" \ + "cvat/apps/engine/cron.py" \ "cvat/apps/engine/default_settings.py" \ "cvat/apps/engine/field_validation.py" \ "cvat/apps/engine/model_utils.py" \ From 42c82f26e043a09f26183a2c670bdfc923f14fa1 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Mon, 23 Dec 2024 13:10:41 +0100 Subject: [PATCH 16/61] Update test_can_remove_export_cache_automatically_after_successful_export --- cvat/apps/dataset_manager/util.py | 9 +--- cvat/apps/engine/models.py | 4 +- cvat/apps/engine/tests/test_rest_api.py | 55 ++++++++++++++++--------- 3 files changed, 39 insertions(+), 29 deletions(-) diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index a5dd46c29154..54dfb1243466 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -249,18 +249,13 @@ def parse_file_path( if file_type in (ExportFileType.DATASET, ExportFileType.ANNOTATIONS): basename_match = re.fullmatch( - ( - # optional for backward compatibility - r"(?:instance(?P\d+\.\d+)-|_)" - r"(?P.+)" # TODO: convert back? 
- r"\.(?P.+)" - ), + r"instance(?P\d+\.\d+)(?P.+)\.(?P.+)", non_parsed_basename, ) ParsedFileNameClass = ParsedDatasetFilename elif file_type == ExportFileType.BACKUP: basename_match = re.fullmatch( - r"(?:instance(?P\d+\.\d+)-|_)" r"\.(?P.+)", + r"instance(?P\d+\.\d+)\.(?P.+)", non_parsed_basename, ) ParsedFileNameClass = ParsedBackupFilename diff --git a/cvat/apps/engine/models.py b/cvat/apps/engine/models.py index 377db1275e8e..7e35cbcde6ff 100644 --- a/cvat/apps/engine/models.py +++ b/cvat/apps/engine/models.py @@ -527,7 +527,7 @@ def get_labels(self, prefetch=False): 'attributespec_set', 'sublabels__attributespec_set', ) if prefetch else queryset - def get_dirname(self): + def get_dirname(self) -> str: return os.path.join(settings.PROJECTS_ROOT, str(self.id)) def is_job_staff(self, user_id): @@ -896,7 +896,7 @@ def get_target_storage(self) -> Optional[Storage]: def get_source_storage(self) -> Optional[Storage]: return self.segment.task.source_storage - def get_dirname(self): + def get_dirname(self) -> str: return os.path.join(settings.JOBS_ROOT, str(self.id)) @extend_schema_field(OpenApiTypes.INT) diff --git a/cvat/apps/engine/tests/test_rest_api.py b/cvat/apps/engine/tests/test_rest_api.py index 88fa4cbdac36..a74154aeb031 100644 --- a/cvat/apps/engine/tests/test_rest_api.py +++ b/cvat/apps/engine/tests/test_rest_api.py @@ -4,6 +4,7 @@ # SPDX-License-Identifier: MIT from contextlib import ExitStack +import django_rq import io from itertools import product import os @@ -27,6 +28,7 @@ import numpy as np from pdf2image import convert_from_bytes from pyunpack import Archive +from datetime import timedelta from django.conf import settings from django.contrib.auth.models import Group, User from django.http import HttpResponse @@ -34,6 +36,8 @@ from pycocotools import coco as coco_loader from rest_framework import status from rest_framework.test import APIClient +from rq.job import Job as RQJob +from rq.queue import Queue as RQQueue from cvat.apps.dataset_manager.tests.utils import TestDir from cvat.apps.dataset_manager.util import current_function_name @@ -3086,32 +3090,43 @@ def test_api_v2_tasks_id_export_somebody(self): def test_api_v2_tasks_id_export_no_auth(self): self._run_api_v2_tasks_id_export_import(None) - # TODO: add another test that checks running cron job - # def test_can_remove_export_cache_automatically_after_successful_export(self): - # self._create_tasks() - # task_id = self.tasks[0]["id"] - # user = self.admin + def test_can_remove_export_cache_automatically_after_successful_export(self): + from cvat.apps.engine.cron import cron_export_cache_cleanup, clear_export_cache + self._create_tasks() + task_id = self.tasks[0]["id"] + user = self.admin - # with mock.patch('cvat.apps.dataset_manager.views.TASK_CACHE_TTL', new=timedelta(hours=10)): - # response = self._run_api_v2_tasks_id_export(task_id, user) - # self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED) + TASK_CACHE_TTL = timedelta(seconds=5) + with ( + mock.patch('cvat.apps.dataset_manager.views.TASK_CACHE_TTL', new=TASK_CACHE_TTL), + mock.patch('cvat.apps.dataset_manager.views.TTL_CONSTS', new={'task': TASK_CACHE_TTL}), + mock.patch( + "cvat.apps.engine.cron.clear_export_cache", + side_effect=clear_export_cache, + ) as mock_clear_export_cache, + ): + cron_export_cache_cleanup(f"cvat.apps.engine.models.Task") + mock_clear_export_cache.assert_not_called() - # response = self._run_api_v2_tasks_id_export(task_id, user) - # self.assertEqual(response.status_code, status.HTTP_201_CREATED) + response = 
self._run_api_v2_tasks_id_export(task_id, user) + self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED) - # scheduler = django_rq.get_scheduler(settings.CVAT_QUEUES.IMPORT_DATA.value) - # scheduled_jobs = list(scheduler.get_jobs()) - # cleanup_job = next( - # j for j in scheduled_jobs if j.func_name.endswith('.engine.backup._clear_export_cache') - # ) + response = self._run_api_v2_tasks_id_export(task_id, user) + self.assertEqual(response.status_code, status.HTTP_201_CREATED) - # export_path = cleanup_job.kwargs['file_path'] - # self.assertTrue(os.path.isfile(export_path)) + queue: RQQueue = django_rq.get_queue(settings.CVAT_QUEUES.EXPORT_DATA.value) + rq_job_ids = queue.finished_job_registry.get_job_ids() + self.assertEqual(len(rq_job_ids), 1) + job: RQJob | None = queue.fetch_job(rq_job_ids[0]) + self.assertFalse(job is None) + file_path = job.return_value() + self.assertTrue(os.path.isfile(file_path)) - # from cvat.apps.engine.backup import _clear_export_cache - # _clear_export_cache(**cleanup_job.kwargs) + sleep(TASK_CACHE_TTL.total_seconds() + 1) - # self.assertFalse(os.path.isfile(export_path)) + cron_export_cache_cleanup(f"cvat.apps.engine.models.Task") + mock_clear_export_cache.assert_called_once() + self.assertFalse(os.path.exists(file_path)) def generate_random_image_file(filename): From 59536b619bb2be8080ba96bb3405a2c0e1f124da Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Tue, 24 Dec 2024 09:34:04 +0100 Subject: [PATCH 17/61] Apply a few comments --- .../tests/test_rest_api_formats.py | 21 +++---- cvat/apps/dataset_manager/util.py | 9 ++- cvat/apps/engine/cron.py | 63 +++++++------------ 3 files changed, 38 insertions(+), 55 deletions(-) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index 008de5f52b59..04ce6d3bd4ae 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -1499,8 +1499,6 @@ def patched_log_exception(logger=None, exc_info=True): def _clear(*_, file_path: str): from os import remove as original_remove - from cvat.apps.engine.cron import FileIsBeingUsedError - with ( patch("cvat.apps.engine.cron.EXPORT_CACHE_LOCK_TTL", new=EXPORT_CACHE_LOCK_TTL), patch("cvat.apps.engine.cron.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", new=EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT), @@ -1521,12 +1519,10 @@ def _clear(*_, file_path: str): side_effect(set_condition, clear_removed_the_file), ) - try: - clear_export_cache( - file_path=file_path, logger=MagicMock() - ) - except FileIsBeingUsedError: - set_condition(clear_has_been_finished) + clear_export_cache( + file_path=file_path, logger=MagicMock() + ) + set_condition(clear_has_been_finished) mock_os_remove.assert_not_called() @@ -2040,24 +2036,25 @@ def test_cleanup_can_remove_file(self): def test_cleanup_can_fail_if_no_file(self): - with self.assertRaises(FileNotFoundError): + from cvat.apps.dataset_manager.util import CacheFilePathParseError + with self.assertRaises(CacheFilePathParseError): clear_export_cache(file_path="non existent file path", logger=MagicMock()) def test_cleanup_can_defer_removal_if_file_is_used_recently(self): + from os import remove as original_remove format_name = "CVAT for images 1.1" task = self._setup_task_with_annotations(format_name=format_name) task_id = task["id"] export_path = export(dst_format=format_name, task_id=task_id) - from cvat.apps.engine.cron import FileIsBeingUsedError - with ( 
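            # A one-hour TTL keeps the file's mtime inside the freshness window,
            # so clear_export_cache must return without deleting it; the os.remove
            # spy below verifies the deletion branch is never reached.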
patch("cvat.apps.dataset_manager.views.TTL_CONSTS", new={"task": timedelta(hours=1)}), - self.assertRaises(FileIsBeingUsedError), + patch("cvat.apps.engine.cron.os.remove", side_effect=original_remove) as mock_os_remove, ): export_path = export(dst_format=format_name, task_id=task_id) clear_export_cache(file_path=export_path, logger=MagicMock()) + mock_os_remove.assert_not_called() self.assertTrue(osp.isfile(export_path)) diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index 54dfb1243466..d8739b3029c9 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -103,6 +103,9 @@ def faster_deepcopy(v): class LockNotAvailableError(Exception): pass +class CacheFilePathParseError(Exception): + pass + def make_export_cache_lock_key(filename: os.PathLike[str]) -> str: return f"export_lock:{os.fspath(filename)}" @@ -237,7 +240,7 @@ def parse_file_path( rf"/(jobs|tasks|projects)/\d+/{settings.EXPORT_CACHE_DIR_NAME}$", dirname ) if not dirname_match: - raise ValueError(f"Couldn't parse instance type in '{dirname}'") + raise CacheFilePathParseError(f"Couldn't parse instance type in '{dirname}'") instance_type_names = dirname_match.group(1) assert instance_type_names in {"projects", "tasks", "jobs"} @@ -260,10 +263,10 @@ def parse_file_path( ) ParsedFileNameClass = ParsedBackupFilename else: - raise ValueError(f"Unsupported file type: {file_type!r}") + raise CacheFilePathParseError(f"Unsupported file type: {file_type!r}") if not basename_match: - raise ValueError(f"Couldn't parse filename components in '{basename}'") + raise CacheFilePathParseError(f"Couldn't parse filename components in '{basename}'") fragments = basename_match.groupdict() diff --git a/cvat/apps/engine/cron.py b/cvat/apps/engine/cron.py index 5f6fad61bb96..8466ab581e0b 100644 --- a/cvat/apps/engine/cron.py +++ b/cvat/apps/engine/cron.py @@ -6,18 +6,13 @@ import logging import os import os.path as osp -from contextlib import suppress from datetime import timedelta from pathlib import Path from django.db.models import QuerySet from django.utils import timezone -from cvat.apps.dataset_manager.util import ( - ExportCacheManager, - LockNotAvailableError, - get_export_cache_lock, -) +from cvat.apps.dataset_manager.util import ExportCacheManager, get_export_cache_lock from cvat.apps.dataset_manager.views import ( EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT, EXPORT_CACHE_LOCK_TTL, @@ -27,37 +22,25 @@ from cvat.apps.engine.log import ServerLogManager from cvat.apps.engine.models import Job, Project, Task - -class FileIsBeingUsedError(Exception): - pass +logger = ServerLogManager(__name__).glob def clear_export_cache(file_path: str, logger: logging.Logger) -> None: - try: - with get_export_cache_lock( - file_path, - block=True, - acquire_timeout=EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT, - ttl=EXPORT_CACHE_LOCK_TTL, - ): - if not osp.exists(file_path): - raise FileNotFoundError(f"Export cache file {file_path} doesn't exist") - - parsed_filename = ExportCacheManager.parse_file_path(file_path) - cache_ttl = get_export_cache_ttl(parsed_filename.instance_type) - - if timezone.now().timestamp() <= osp.getmtime(file_path) + cache_ttl.total_seconds(): - logger.info("Cache file '{}' is recently accessed".format(file_path)) - raise FileIsBeingUsedError - - os.remove(file_path) - logger.debug(f"Export cache file {file_path!r} successfully removed") - except LockNotAvailableError: - logger.info(f"Failed to acquire export cache lock for the file: {file_path}.") - raise - except Exception: - 
log_exception(logger) - raise + with get_export_cache_lock( + file_path, + block=True, + acquire_timeout=EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT, + ttl=EXPORT_CACHE_LOCK_TTL, + ): + parsed_filename = ExportCacheManager.parse_file_path(file_path) + cache_ttl = get_export_cache_ttl(parsed_filename.instance_type) + + if timezone.now().timestamp() <= osp.getmtime(file_path) + cache_ttl.total_seconds(): + logger.info("Cache file '{}' is recently accessed".format(file_path)) + return + + os.remove(file_path) + logger.debug(f"Export cache file {file_path!r} successfully removed") def cron_export_cache_cleanup(path_to_model: str) -> None: @@ -69,8 +52,6 @@ def cron_export_cache_cleanup(path_to_model: str) -> None: ModelClass = getattr(module, model_name) assert ModelClass in (Project, Task, Job) - logger = ServerLogManager(__name__).glob - one_month_ago = timezone.now() - timedelta(days=30) queryset: QuerySet[Project | Task | Job] = ModelClass.objects.filter( last_export_date__gte=one_month_ago @@ -82,7 +63,7 @@ def cron_export_cache_cleanup(path_to_model: str) -> None: if not export_cache_dir_path.exists(): logger.debug( - f"The {export_cache_dir_path.relative_to(instance_dir_path)} path does not exist, skipping..." + f"{export_cache_dir_path.relative_to(instance_dir_path)} path does not exist, skipping..." ) continue @@ -94,11 +75,13 @@ def cron_export_cache_cleanup(path_to_model: str) -> None: ) continue - with suppress(Exception): + try: clear_export_cache(child, logger) + except Exception: + log_exception(logger) finished_at = timezone.now() logger.info( - f"Clearing the {model_name.lower()}'s export cache has been successfully " - f"completed after {int((finished_at - started_at).total_seconds())} seconds..." + f"Clearing the {model_name.lower()} export cache has been successfully " + f"completed after {int((finished_at - started_at).total_seconds())} seconds." 
) From 685bdf437b94dbc71edb3aa73e2d3dbc93c9e020 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Tue, 24 Dec 2024 14:01:56 +0100 Subject: [PATCH 18/61] Apply comments --- .../tests/test_rest_api_formats.py | 2 +- cvat/apps/dataset_manager/util.py | 36 +++++----- cvat/apps/engine/cron.py | 71 +++++++++---------- .../management/commands/syncperiodicjobs.py | 44 ++++++++---- cvat/apps/engine/tests/test_rest_api.py | 4 +- dev/format_python_code.sh | 1 + 6 files changed, 85 insertions(+), 73 deletions(-) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index 04ce6d3bd4ae..45fc2adc5b00 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -2098,7 +2098,7 @@ def _get_project_task_job_ids(): side_effect=clear_export_cache, ) as mock_clear_export_cache, ): - cron_export_cache_cleanup(f"cvat.apps.engine.models.{resource.title()}") + cron_export_cache_cleanup() mock_clear_export_cache.assert_called_once() self.assertFalse(osp.exists(export_path)) diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index d8739b3029c9..0a530d293183 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -247,36 +247,38 @@ def parse_file_path( instance_type_name = instance_type_names[:-1] # handle file name - file_type, non_parsed_basename = basename.split("-", maxsplit=1) + file_type, unparsed = basename.split("-", maxsplit=1) file_type = ExportFileType(file_type) + unparsed, file_ext = osp.splitext(unparsed) + unparsed = unparsed[len('instance'):] + specific_params = {} + if file_type in (ExportFileType.DATASET, ExportFileType.ANNOTATIONS): - basename_match = re.fullmatch( - r"instance(?P\d+\.\d+)(?P.+)\.(?P.+)", - non_parsed_basename, - ) + try: + instance_timestamp, format_repr = unparsed.split("-", maxsplit=1) + except ValueError: + raise CacheFilePathParseError(f"Couldn't parse file name: '{basename}'") + + specific_params["format_repr"] = format_repr ParsedFileNameClass = ParsedDatasetFilename elif file_type == ExportFileType.BACKUP: - basename_match = re.fullmatch( - r"instance(?P\d+\.\d+)\.(?P.+)", - non_parsed_basename, - ) + instance_timestamp = unparsed ParsedFileNameClass = ParsedBackupFilename else: raise CacheFilePathParseError(f"Unsupported file type: {file_type!r}") - if not basename_match: - raise CacheFilePathParseError(f"Couldn't parse filename components in '{basename}'") - - fragments = basename_match.groupdict() - - if fragments.get("instance_timestamp"): - fragments["instance_timestamp"] = float(fragments["instance_timestamp"]) + try: + instance_timestamp = float(instance_timestamp) + except ValueError: + raise CacheFilePathParseError(f"Couldn't parse instance timestamp: '{instance_timestamp}'") return ParsedFileNameClass( file_type=file_type.value, + file_ext=file_ext, instance_type=instance_type_name, - **fragments, + instance_timestamp=instance_timestamp, + **specific_params, ) diff --git a/cvat/apps/engine/cron.py b/cvat/apps/engine/cron.py index 8466ab581e0b..bf379bd57e1d 100644 --- a/cvat/apps/engine/cron.py +++ b/cvat/apps/engine/cron.py @@ -43,45 +43,40 @@ def clear_export_cache(file_path: str, logger: logging.Logger) -> None: logger.debug(f"Export cache file {file_path!r} successfully removed") -def cron_export_cache_cleanup(path_to_model: str) -> None: - assert isinstance(path_to_model, str) - - started_at = timezone.now() - module_name, model_name = 
path_to_model.rsplit(".", 1)
-    module = importlib.import_module(module_name)
-    ModelClass = getattr(module, model_name)
-    assert ModelClass in (Project, Task, Job)
-
-    one_month_ago = timezone.now() - timedelta(days=30)
-    queryset: QuerySet[Project | Task | Job] = ModelClass.objects.filter(
-        last_export_date__gte=one_month_ago
-    )
-
-    for instance in queryset.iterator():
-        instance_dir_path = Path(instance.get_dirname())
-        export_cache_dir_path = Path(instance.get_export_cache_directory())
-
-        if not export_cache_dir_path.exists():
-            logger.debug(
-                f"{export_cache_dir_path.relative_to(instance_dir_path)} path does not exist, skipping..."
-            )
-            continue
-
-        for child in export_cache_dir_path.iterdir():
-            # export cache dir may contain temporary directories
-            if not child.is_file():
-                logger.debug(
-                    f"The {child.relative_to(instance_dir_path)} is not a file, skipping..."
-                )
-                continue
-
-            try:
-                clear_export_cache(child, logger)
-            except Exception:
-                log_exception(logger)
-
-    finished_at = timezone.now()
-    logger.info(
-        f"Clearing the {model_name.lower()} export cache has been successfully "
-        f"completed after {int((finished_at - started_at).total_seconds())} seconds."
-    )
+def cron_export_cache_cleanup() -> None:
+    for Model in (Project, Task, Job):
+        started_at = timezone.now()
+        one_month_ago = timezone.now() - timedelta(days=30)
+        queryset: QuerySet[Project | Task | Job] = Model.objects.filter(
+            last_export_date__gte=one_month_ago
+        )
+
+        for instance in queryset.iterator():
+            instance_dir_path = Path(instance.get_dirname())
+            export_cache_dir_path = Path(instance.get_export_cache_directory())
+
+            if not export_cache_dir_path.exists():
+                logger.debug(
+                    f"{export_cache_dir_path.relative_to(instance_dir_path)} path does not exist, skipping..."
+                )
+                continue
+
+            for child in export_cache_dir_path.iterdir():
+                # TODO: write into a file about each file that should be removed manually or separately
+                # export cache dir may contain temporary directories
+                if not child.is_file():
+                    logger.debug(
+                        f"The {child.relative_to(instance_dir_path)} is not a file, skipping..."
+                    )
+                    continue
+
+                try:
+                    clear_export_cache(child, logger)
+                except Exception:
+                    log_exception(logger)
+
+        finished_at = timezone.now()
+        logger.info(
+            f"Clearing the {Model.__name__.lower()} export cache has been successfully "
+            f"completed after {int((finished_at - started_at).total_seconds())} seconds." 
+ ) diff --git a/cvat/apps/engine/management/commands/syncperiodicjobs.py b/cvat/apps/engine/management/commands/syncperiodicjobs.py index ecabcc11e19f..3463b71c5359 100644 --- a/cvat/apps/engine/management/commands/syncperiodicjobs.py +++ b/cvat/apps/engine/management/commands/syncperiodicjobs.py @@ -5,25 +5,28 @@ from argparse import ArgumentParser from collections import defaultdict -from django.core.management.base import BaseCommand +import django_rq from django.conf import settings +from django.core.management.base import BaseCommand +from rq.job import Job as RQJob -import django_rq class Command(BaseCommand): help = "Synchronize periodic jobs in Redis with the project configuration" - _PERIODIC_JOBS_KEY_PREFIX = 'cvat:utils:periodic-jobs:' + _PERIODIC_JOBS_KEY_PREFIX = "cvat:utils:periodic-jobs:" def add_arguments(self, parser: ArgumentParser) -> None: - parser.add_argument('--clear', action='store_true', help='Remove jobs from Redis instead of updating them') + parser.add_argument( + "--clear", action="store_true", help="Remove jobs from Redis instead of updating them" + ) def handle(self, *args, **options): configured_jobs = defaultdict(dict) if not options["clear"]: for job in settings.PERIODIC_RQ_JOBS: - configured_jobs[job['queue']][job['id']] = job + configured_jobs[job["queue"]][job["id"]] = job for queue_name in settings.RQ_QUEUES: self.stdout.write(f"Processing queue {queue_name}...") @@ -34,7 +37,7 @@ def handle(self, *args, **options): scheduler = django_rq.get_scheduler(queue_name, queue=queue) stored_jobs_for_queue = { - member.decode('UTF-8') for member in queue.connection.smembers(periodic_jobs_key) + member.decode("UTF-8") for member in queue.connection.smembers(periodic_jobs_key) } configured_jobs_for_queue = configured_jobs[queue_name] @@ -49,15 +52,26 @@ def handle(self, *args, **options): queue.connection.srem(periodic_jobs_key, job_id) + def is_job_actual(job: RQJob, job_definition: dict): + return ( + job.func_name == job_definition["func"] + and job.meta.get("cron_string") == job_definition["cron_string"] + and ( + not (job.args or job_definition.get("args")) + or job.args == job_definition.get("args") + ) + and ( + not (job.kwargs or job_definition.get("kwargs")) + or job.kwargs == job_definition.get("kwargs") + ) + ) + # Add/update jobs from the configuration for job_definition in configured_jobs_for_queue.values(): - job_id = job_definition['id'] + job_id = job_definition["id"] if job := queue.fetch_job(job_id): - if ( - job.func_name == job_definition['func'] - and job.meta.get('cron_string') == job_definition['cron_string'] - ): + if is_job_actual(job, job_definition): self.stdout.write(f"Job {job_id} is unchanged") queue.connection.sadd(periodic_jobs_key, job_id) continue @@ -68,11 +82,11 @@ def handle(self, *args, **options): self.stdout.write(f"Creating job {job_id}...") scheduler.cron( - cron_string=job_definition['cron_string'], - func=job_definition['func'], + cron_string=job_definition["cron_string"], + func=job_definition["func"], id=job_id, - args=job_definition.get('args'), - kwargs=job_definition.get('kwargs'), + args=job_definition.get("args"), + kwargs=job_definition.get("kwargs"), ) queue.connection.sadd(periodic_jobs_key, job_id) diff --git a/cvat/apps/engine/tests/test_rest_api.py b/cvat/apps/engine/tests/test_rest_api.py index a74154aeb031..e47fff0b61a2 100644 --- a/cvat/apps/engine/tests/test_rest_api.py +++ b/cvat/apps/engine/tests/test_rest_api.py @@ -3105,7 +3105,7 @@ def 
test_can_remove_export_cache_automatically_after_successful_export(self): side_effect=clear_export_cache, ) as mock_clear_export_cache, ): - cron_export_cache_cleanup(f"cvat.apps.engine.models.Task") + cron_export_cache_cleanup() mock_clear_export_cache.assert_not_called() response = self._run_api_v2_tasks_id_export(task_id, user) @@ -3124,7 +3124,7 @@ def test_can_remove_export_cache_automatically_after_successful_export(self): sleep(TASK_CACHE_TTL.total_seconds() + 1) - cron_export_cache_cleanup(f"cvat.apps.engine.models.Task") + cron_export_cache_cleanup() mock_clear_export_cache.assert_called_once() self.assertFalse(os.path.exists(file_path)) diff --git a/dev/format_python_code.sh b/dev/format_python_code.sh index 81f2dbeb64ca..f3ae0b63c315 100755 --- a/dev/format_python_code.sh +++ b/dev/format_python_code.sh @@ -35,6 +35,7 @@ for paths in \ "cvat/apps/dataset_manager/tests/test_annotation.py" \ "cvat/apps/dataset_manager/tests/utils.py" \ "cvat/apps/events/signals.py" \ + "cvat/apps/engine/management/commands/syncperiodicjobs.py" \ ; do ${BLACK} -- ${paths} ${ISORT} -- ${paths} From 52b074645083f818501f4216e7f942ca1c45a205 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Tue, 24 Dec 2024 14:34:26 +0100 Subject: [PATCH 19/61] Update cvat/apps/engine/cron.py Co-authored-by: Roman Donchenko --- cvat/apps/engine/cron.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cvat/apps/engine/cron.py b/cvat/apps/engine/cron.py index bf379bd57e1d..f9ce888780d9 100644 --- a/cvat/apps/engine/cron.py +++ b/cvat/apps/engine/cron.py @@ -25,7 +25,7 @@ logger = ServerLogManager(__name__).glob -def clear_export_cache(file_path: str, logger: logging.Logger) -> None: +def clear_export_cache(file_path: str) -> None: with get_export_cache_lock( file_path, block=True, From f830cbb6c7fc92011cba8bce13757293e7b2cc6b Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Tue, 24 Dec 2024 14:36:40 +0100 Subject: [PATCH 20/61] Do not pass logger arg into clear_export_cache --- .../dataset_manager/tests/test_rest_api_formats.py | 14 +++++--------- cvat/apps/engine/cron.py | 2 +- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index 45fc2adc5b00..a59289cafa1b 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -1519,9 +1519,7 @@ def _clear(*_, file_path: str): side_effect(set_condition, clear_removed_the_file), ) - clear_export_cache( - file_path=file_path, logger=MagicMock() - ) + clear_export_cache(file_path=file_path) set_condition(clear_has_been_finished) mock_os_remove.assert_not_called() @@ -1700,9 +1698,7 @@ def _clear(*_, file_path: str): exited_by_timeout = False try: - clear_export_cache( - file_path=file_path, logger=MagicMock() - ) + clear_export_cache(file_path=file_path) except LockNotAvailableError: # should come from waiting for get_export_cache_lock exited_by_timeout = True @@ -2030,7 +2026,7 @@ def test_cleanup_can_remove_file(self): patch("cvat.apps.dataset_manager.views.TTL_CONSTS", new={"task": timedelta(seconds=0)}), ): export_path = export(dst_format=format_name, task_id=task_id) - clear_export_cache(file_path=export_path, logger=MagicMock()) + clear_export_cache(file_path=export_path) self.assertFalse(osp.isfile(export_path)) @@ -2038,7 +2034,7 @@ def test_cleanup_can_remove_file(self): def test_cleanup_can_fail_if_no_file(self): from 
cvat.apps.dataset_manager.util import CacheFilePathParseError
 
         with self.assertRaises(CacheFilePathParseError):
-            clear_export_cache(file_path="non existent file path", logger=MagicMock())
+            clear_export_cache(file_path="non existent file path")
 
     def test_cleanup_can_defer_removal_if_file_is_used_recently(self):
         from os import remove as original_remove
@@ -2053,7 +2049,7 @@ def test_cleanup_can_defer_removal_if_file_is_used_recently(self):
             patch("cvat.apps.engine.cron.os.remove", side_effect=original_remove) as mock_os_remove,
         ):
             export_path = export(dst_format=format_name, task_id=task_id)
-            clear_export_cache(file_path=export_path, logger=MagicMock())
+            clear_export_cache(file_path=export_path)
 
             mock_os_remove.assert_not_called()
             self.assertTrue(osp.isfile(export_path))
diff --git a/cvat/apps/engine/cron.py b/cvat/apps/engine/cron.py
index f9ce888780d9..0225b9bc6f27 100644
--- a/cvat/apps/engine/cron.py
+++ b/cvat/apps/engine/cron.py
@@ -71,7 +71,7 @@ def cron_export_cache_cleanup() -> None:
                 continue
 
             try:
-                clear_export_cache(child, logger)
+                clear_export_cache(child)
             except Exception:
                 log_exception(logger)
 

From 7962751a2ef0679d4fa47df1aaec48eba3e4beb5 Mon Sep 17 00:00:00 2001
From: Maria Khrustaleva
Date: Tue, 24 Dec 2024 14:40:57 +0100
Subject: [PATCH 21/61] Remove unused imports

---
 cvat/apps/engine/cron.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cvat/apps/engine/cron.py b/cvat/apps/engine/cron.py
index 0225b9bc6f27..72ea34372c98 100644
--- a/cvat/apps/engine/cron.py
+++ b/cvat/apps/engine/cron.py
@@ -2,8 +2,6 @@
 #
 # SPDX-License-Identifier: MIT
 
-import importlib
-import logging
 import os
 import os.path as osp
 from datetime import timedelta

From 5b5d0fb03df093b675dcc8964f52f7bfa912aec5 Mon Sep 17 00:00:00 2001
From: Maria Khrustaleva
Date: Tue, 24 Dec 2024 15:13:03 +0100
Subject: [PATCH 22/61] Add changelog

---
 changelog.d/20241224_150942_maria_clear_cache_cron_job.md | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 changelog.d/20241224_150942_maria_clear_cache_cron_job.md

diff --git a/changelog.d/20241224_150942_maria_clear_cache_cron_job.md b/changelog.d/20241224_150942_maria_clear_cache_cron_job.md
new file mode 100644
index 000000000000..52ab9046fc97
--- /dev/null
+++ b/changelog.d/20241224_150942_maria_clear_cache_cron_job.md
@@ -0,0 +1,4 @@
+### Changed
+
+- Export cache cleaning moved to a separate cron job
+  ()

From e01e48342939627a1f9ed9a950bb5730d323340d Mon Sep 17 00:00:00 2001
From: Maria Khrustaleva
Date: Tue, 24 Dec 2024 17:58:53 +0100
Subject: [PATCH 23/61] Refactor a bit

---
 cvat/apps/dataset_manager/util.py | 29 +++++++++++++++++------------
 cvat/apps/engine/cron.py | 5 ++---
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py
index 0a530d293183..1ab3b5625545 100644
--- a/cvat/apps/dataset_manager/util.py
+++ b/cvat/apps/dataset_manager/util.py
@@ -179,10 +179,15 @@ class ParsedBackupFilename(_ParsedExportFilename):
 
 
 class ExportCacheManager:
-    # store the instance timestamp in the file name to reliably get this information
-    # ctime / mtime do not return file creation time on linux
-    # mtime is used for file usage checks
-    BASE_FILE_NAME_TEMPLATE = "{file_type}-instance{instance_timestamp}{optional_suffix}.{file_ext}"
+    SPLITTER = "-"
+    INSTANCE_PREFIX = "instance"
+    FILE_NAME_TEMPLATE = SPLITTER.join([
+        "{file_type}", INSTANCE_PREFIX +
+        # store the instance timestamp in the file name to reliably get this information
+        # ctime / mtime do not return file creation time 
on linux + # mtime is used for file usage checks + "{instance_timestamp}{optional_suffix}.{file_ext}" + ]) @classmethod def make_dataset_file_path( @@ -200,11 +205,11 @@ def make_dataset_file_path( file_type = ExportFileType.DATASET if save_images else ExportFileType.ANNOTATIONS normalized_format_name = make_file_name(to_snake_case(format_name)) - filename = cls.BASE_FILE_NAME_TEMPLATE.format_map( + filename = cls.FILE_NAME_TEMPLATE.format_map( { "file_type": file_type, "instance_timestamp": instance_timestamp, - "optional_suffix": "-" + normalized_format_name, + "optional_suffix": cls.SPLITTER + normalized_format_name, "file_ext": file_ext, } ) @@ -218,7 +223,7 @@ def make_backup_file_path( *, instance_timestamp: float, ) -> str: - filename = cls.BASE_FILE_NAME_TEMPLATE.format_map( + filename = cls.FILE_NAME_TEMPLATE.format_map( { "file_type": ExportFileType.BACKUP, "instance_timestamp": instance_timestamp, @@ -228,9 +233,9 @@ def make_backup_file_path( ) return osp.join(cache_dir, filename) - @staticmethod + @classmethod def parse_file_path( - file_path: os.PathLike[str], + cls, file_path: os.PathLike[str], ) -> ParsedDatasetFilename | ParsedBackupFilename: file_path = osp.normpath(file_path) dirname, basename = osp.split(file_path) @@ -247,16 +252,16 @@ def parse_file_path( instance_type_name = instance_type_names[:-1] # handle file name - file_type, unparsed = basename.split("-", maxsplit=1) + file_type, unparsed = basename.split(cls.SPLITTER, maxsplit=1) file_type = ExportFileType(file_type) unparsed, file_ext = osp.splitext(unparsed) - unparsed = unparsed[len('instance'):] + unparsed = unparsed[len(cls.INSTANCE_PREFIX):] specific_params = {} if file_type in (ExportFileType.DATASET, ExportFileType.ANNOTATIONS): try: - instance_timestamp, format_repr = unparsed.split("-", maxsplit=1) + instance_timestamp, format_repr = unparsed.split(cls.SPLITTER, maxsplit=1) except ValueError: raise CacheFilePathParseError(f"Couldn't parse file name: '{basename}'") diff --git a/cvat/apps/engine/cron.py b/cvat/apps/engine/cron.py index 72ea34372c98..f5050c063763 100644 --- a/cvat/apps/engine/cron.py +++ b/cvat/apps/engine/cron.py @@ -34,11 +34,11 @@ def clear_export_cache(file_path: str) -> None: cache_ttl = get_export_cache_ttl(parsed_filename.instance_type) if timezone.now().timestamp() <= osp.getmtime(file_path) + cache_ttl.total_seconds(): - logger.info("Cache file '{}' is recently accessed".format(file_path)) + logger.debug(f"Export cache file {file_path!r} was recently accessed".format(file_path)) return os.remove(file_path) - logger.debug(f"Export cache file {file_path!r} successfully removed") + logger.debug(f"Export cache file {file_path!r} was successfully removed") def cron_export_cache_cleanup() -> None: @@ -60,7 +60,6 @@ def cron_export_cache_cleanup() -> None: continue for child in export_cache_dir_path.iterdir(): - # TODO: write into a file about each file that should be removed manually or separately # export cache dir may contain temporary directories if not child.is_file(): logger.debug( From f316a90cc666ffc026182c42474803378cff9f49 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Thu, 26 Dec 2024 13:48:32 +0100 Subject: [PATCH 24/61] Switch to using a common export cache dir --- cvat/apps/dataset_manager/util.py | 43 +++--- cvat/apps/dataset_manager/views.py | 10 +- cvat/apps/engine/backup.py | 10 +- cvat/apps/engine/cron.py | 122 +++++++++++++----- cvat/apps/engine/default_settings.py | 2 - ..._date_project_last_export_date_and_more.py | 28 ---- cvat/apps/engine/models.py | 25 
+--- cvat/settings/base.py | 27 ++-- 8 files changed, 143 insertions(+), 124 deletions(-) delete mode 100644 cvat/apps/engine/migrations/0087_job_last_export_date_project_last_export_date_and_more.py diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index 1ab3b5625545..b17dd2f2be58 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -159,12 +159,18 @@ class ExportFileType(str, Enum): BACKUP = "backup" DATASET = "dataset" +class InstanceType(str, Enum): + PROJECT = "project" + TASK = "task" + JOB = "job" + @attrs.frozen class _ParsedExportFilename: file_type: ExportFileType file_ext: str instance_type: str + instance_id: int instance_timestamp: float @@ -182,7 +188,7 @@ class ExportCacheManager: SPLITTER = "-" INSTANCE_PREFIX = "instance" FILE_NAME_TEMPLATE = SPLITTER.join([ - "{file_type}", INSTANCE_PREFIX + + "{instance_type}", "{instance_id}", "{file_type}", INSTANCE_PREFIX + # store the instance timestamp in the file name to reliably get this information # ctime / mtime do not return file creation time on linux # mtime is used for file usage checks @@ -194,19 +200,24 @@ def make_dataset_file_path( cls, cache_dir: str, *, - save_images: bool, + instance_type: str, + instance_id: int, instance_timestamp: float, + save_images: bool, format_name: str, ) -> str: from .formats.registry import EXPORT_FORMATS file_ext = EXPORT_FORMATS[format_name].EXT + instance_type = InstanceType(instance_type.lower()) file_type = ExportFileType.DATASET if save_images else ExportFileType.ANNOTATIONS normalized_format_name = make_file_name(to_snake_case(format_name)) filename = cls.FILE_NAME_TEMPLATE.format_map( { + "instance_type": instance_type, + "instance_id": instance_id, "file_type": file_type, "instance_timestamp": instance_timestamp, "optional_suffix": cls.SPLITTER + normalized_format_name, @@ -221,10 +232,15 @@ def make_backup_file_path( cls, cache_dir: str, *, + instance_type: str, + instance_id: int, instance_timestamp: float, ) -> str: + instance_type = InstanceType(instance_type.lower()) filename = cls.FILE_NAME_TEMPLATE.format_map( { + "instance_type": instance_type, + "instance_id": instance_id, "file_type": ExportFileType.BACKUP, "instance_timestamp": instance_timestamp, "optional_suffix": "", @@ -238,21 +254,17 @@ def parse_file_path( cls, file_path: os.PathLike[str], ) -> ParsedDatasetFilename | ParsedBackupFilename: file_path = osp.normpath(file_path) - dirname, basename = osp.split(file_path) + basename = osp.split(file_path)[1] - # handle directory - dirname_match = re.search( - rf"/(jobs|tasks|projects)/\d+/{settings.EXPORT_CACHE_DIR_NAME}$", dirname - ) - if not dirname_match: - raise CacheFilePathParseError(f"Couldn't parse instance type in '{dirname}'") - - instance_type_names = dirname_match.group(1) - assert instance_type_names in {"projects", "tasks", "jobs"} - instance_type_name = instance_type_names[:-1] # handle file name - file_type, unparsed = basename.split(cls.SPLITTER, maxsplit=1) + instance_type, unparsed = basename.split(cls.SPLITTER, maxsplit=1) + instance_type = InstanceType(instance_type) + + instance_id, unparsed = basename.split(cls.SPLITTER, maxsplit=1) + instance_id = int(instance_id) + + file_type, unparsed = unparsed.split(cls.SPLITTER, maxsplit=1) file_type = ExportFileType(file_type) unparsed, file_ext = osp.splitext(unparsed) @@ -281,7 +293,8 @@ def parse_file_path( return ParsedFileNameClass( file_type=file_type.value, file_ext=file_ext, - instance_type=instance_type_name, + 
instance_id=instance_id, + instance_type=instance_type.value, instance_timestamp=instance_timestamp, **specific_params, ) diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index 1c6a56dff8cb..6b770596edc2 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -131,10 +131,8 @@ def export( export_fn = task.export_job db_instance = Job.objects.get(pk=job_id) - db_instance.touch_last_export_date() - cache_ttl = get_export_cache_ttl(db_instance) - cache_dir = db_instance.get_export_cache_directory(create=True) + cache_dir = settings.EXPORT_CACHE_ROOT # As we're not locking the db object here, it can be updated by the time of actual export. # The file will be saved with the older timestamp. @@ -149,7 +147,11 @@ def export( instance_update_time = max(tasks_update + [instance_update_time]) output_path = ExportCacheManager.make_dataset_file_path( - cache_dir, save_images=save_images, instance_timestamp=instance_update_time.timestamp(), + cache_dir, + instance_id=db_instance.id, + instance_type=db_instance.__class__.__name__, + instance_timestamp=instance_update_time.timestamp(), + save_images=save_images, format_name=dst_format ) diff --git a/cvat/apps/engine/backup.py b/cvat/apps/engine/backup.py index 3fc5896bc027..82cbbcf4fac0 100644 --- a/cvat/apps/engine/backup.py +++ b/cvat/apps/engine/backup.py @@ -1035,12 +1035,16 @@ def create_backup( cache_ttl: timedelta, ): try: - cache_dir = db_instance.get_export_cache_directory(create=True) - db_instance.touch_last_export_date() + cache_dir = settings.EXPORT_CACHE_ROOT db_instance.refresh_from_db(fields=['updated_date']) instance_timestamp = timezone.localtime(db_instance.updated_date).timestamp() - output_path = ExportCacheManager.make_backup_file_path(cache_dir, instance_timestamp=instance_timestamp) + output_path = ExportCacheManager.make_backup_file_path( + cache_dir, + instance_id=db_instance.id, + instance_type=db_instance.__class__.__name__, + instance_timestamp=instance_timestamp + ) with get_export_cache_lock( output_path, diff --git a/cvat/apps/engine/cron.py b/cvat/apps/engine/cron.py index f5050c063763..1aa6fd2b1b86 100644 --- a/cvat/apps/engine/cron.py +++ b/cvat/apps/engine/cron.py @@ -2,13 +2,19 @@ # # SPDX-License-Identifier: MIT +from __future__ import annotations + import os import os.path as osp from datetime import timedelta from pathlib import Path +from threading import Event, Thread +from time import sleep +from typing import Callable -from django.db.models import QuerySet +from django.conf import settings from django.utils import timezone +from rq import get_current_job from cvat.apps.dataset_manager.util import ExportCacheManager, get_export_cache_lock from cvat.apps.dataset_manager.views import ( @@ -18,7 +24,6 @@ log_exception, ) from cvat.apps.engine.log import ServerLogManager -from cvat.apps.engine.models import Job, Project, Task logger = ServerLogManager(__name__).glob @@ -34,46 +39,93 @@ def clear_export_cache(file_path: str) -> None: cache_ttl = get_export_cache_ttl(parsed_filename.instance_type) if timezone.now().timestamp() <= osp.getmtime(file_path) + cache_ttl.total_seconds(): - logger.debug(f"Export cache file {file_path!r} was recently accessed".format(file_path)) + logger.debug(f"Export cache file {file_path!r} was recently accessed") return os.remove(file_path) logger.debug(f"Export cache file {file_path!r} was successfully removed") -def cron_export_cache_cleanup() -> None: - for Model in (Project, Task, Job): - started_at = 
timezone.now() - one_month_ago = timezone.now() - timedelta(days=30) - queryset: QuerySet[Project | Task | Job] = Model.objects.filter( - last_export_date__gte=one_month_ago - ) - - for instance in queryset.iterator(): - instance_dir_path = Path(instance.get_dirname()) - export_cache_dir_path = Path(instance.get_export_cache_directory()) - - if not export_cache_dir_path.exists(): +class CleanupExportCacheThread(Thread): + def __init__(self, stop_event: Event, *args, **kwargs) -> None: + self._stop_event = stop_event + self._removed_files_count = 0 + self._exception_occurred = None + super().__init__(*args, **kwargs, target=self._cleanup_export_cache) + + @property + def removed_files_count(self) -> int: + return self._removed_files_count + + @property + def exception_occurred(self) -> Exception | None: + return self._exception_occurred + + def suppress_exceptions(method: Callable): + def wrapper(self: CleanupExportCacheThread): + try: + method(self) + except Exception as ex: + self._exception_occurred = ex + + return wrapper + + @suppress_exceptions + def _cleanup_export_cache(self) -> None: + # raise Exception("Ooops") + export_cache_dir_path = Path(settings.EXPORT_CACHE_ROOT) + assert export_cache_dir_path.exists() + + # TODO: use scandir + for child in export_cache_dir_path.iterdir(): + # stop clean up process correctly before rq job timeout is ended + if self._stop_event.is_set(): + return + + # export cache directory may contain temporary directories + if not child.is_file(): logger.debug( - f"{export_cache_dir_path.relative_to(instance_dir_path)} path does not exist, skipping..." + f"The {child.relative_to(export_cache_dir_path)} is not a file, skipping..." ) continue - for child in export_cache_dir_path.iterdir(): - # export cache dir may contain temporary directories - if not child.is_file(): - logger.debug( - f"The {child.relative_to(instance_dir_path)} is not a file, skipping..." - ) - continue - - try: - clear_export_cache(child) - except Exception: - log_exception(logger) - - finished_at = timezone.now() - logger.info( - f"Clearing the {Model.__class__.__name__.lower()} export cache has been successfully " - f"completed after {int((finished_at - started_at).total_seconds())} seconds." - ) + try: + clear_export_cache(child) + self._removed_files_count += 1 + except Exception: + log_exception(logger) + + +def cron_export_cache_cleanup() -> None: + started_at = timezone.now() + rq_job = get_current_job() + seconds_left = rq_job.timeout - 60 + sleep_interval = 30 + assert seconds_left > sleep_interval + 10 # TODO: + finish_before = started_at + timedelta(seconds=seconds_left) + + stop_event = Event() + cleanup_export_cache_thread = CleanupExportCacheThread(stop_event=stop_event) + cleanup_export_cache_thread.start() + + while timezone.now() < finish_before: + if not cleanup_export_cache_thread.is_alive(): + stop_event.set() + break + sleep(sleep_interval) + + if not stop_event.is_set(): + stop_event.set() + + cleanup_export_cache_thread.join() + if exception_occurred := cleanup_export_cache_thread.exception_occurred: + raise exception_occurred + + removed_files_count = cleanup_export_cache_thread.removed_files_count + + finished_at = timezone.now() + logger.info( + f"Export cache cleanup has been successfully " + f"completed after {int((finished_at - started_at).total_seconds())} seconds. 
" + f"{removed_files_count} files have been removed" + ) diff --git a/cvat/apps/engine/default_settings.py b/cvat/apps/engine/default_settings.py index 9b92211031eb..f853d3bc8219 100644 --- a/cvat/apps/engine/default_settings.py +++ b/cvat/apps/engine/default_settings.py @@ -91,5 +91,3 @@ EXPORT_LOCKED_RETRY_INTERVAL = int( os.getenv("CVAT_EXPORT_LOCKED_RETRY_INTERVAL", default_export_locked_retry_interval) ) - -EXPORT_CACHE_DIR_NAME = "export_cache" diff --git a/cvat/apps/engine/migrations/0087_job_last_export_date_project_last_export_date_and_more.py b/cvat/apps/engine/migrations/0087_job_last_export_date_project_last_export_date_and_more.py deleted file mode 100644 index 9468ae3f2768..000000000000 --- a/cvat/apps/engine/migrations/0087_job_last_export_date_project_last_export_date_and_more.py +++ /dev/null @@ -1,28 +0,0 @@ -# Generated by Django 4.2.15 on 2024-12-09 16:51 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ("engine", "0086_profile_has_analytics_access"), - ] - - operations = [ - migrations.AddField( - model_name="job", - name="last_export_date", - field=models.DateTimeField(null=True), - ), - migrations.AddField( - model_name="project", - name="last_export_date", - field=models.DateTimeField(null=True), - ), - migrations.AddField( - model_name="task", - name="last_export_date", - field=models.DateTimeField(null=True), - ), - ] diff --git a/cvat/apps/engine/models.py b/cvat/apps/engine/models.py index 409a5b8d21e9..39b3671d71ce 100644 --- a/cvat/apps/engine/models.py +++ b/cvat/apps/engine/models.py @@ -447,25 +447,6 @@ def get_dirname(self) -> str: def get_tmp_dirname(self) -> str: return os.path.join(self.get_dirname(), "tmp") - def get_export_cache_directory(self, create: bool = False) -> str: - base_dir = os.path.abspath(self.get_dirname()) - cache_dir = os.path.join(base_dir, settings.EXPORT_CACHE_DIR_NAME) - - if create: - os.makedirs(cache_dir, exist_ok=True) - - return cache_dir - - -class _Exportable(models.Model): - class Meta: - abstract = True - - last_export_date = models.DateTimeField(null=True) - - def touch_last_export_date(self): - self.last_export_date = timezone.now() - self.save(update_fields=["last_export_date"]) @transaction.atomic(savepoint=False) def clear_annotations_in_jobs(job_ids): @@ -504,7 +485,7 @@ def clear_annotations_on_frames_in_honeypot_task(db_task: Task, frames: Sequence frame__in=frames_batch, ).delete() -class Project(TimestampedModel, _FileSystemRelatedModel, _Exportable): +class Project(TimestampedModel, _FileSystemRelatedModel): name = SafeCharField(max_length=256) owner = models.ForeignKey(User, null=True, blank=True, on_delete=models.SET_NULL, related_name="+") @@ -579,7 +560,7 @@ def with_job_summary(self): ) ) -class Task(TimestampedModel, _FileSystemRelatedModel, _Exportable): +class Task(TimestampedModel, _FileSystemRelatedModel): objects = TaskQuerySet.as_manager() project = models.ForeignKey(Project, on_delete=models.CASCADE, @@ -870,7 +851,7 @@ def _validate_constraints(self, obj: dict[str, Any]): -class Job(TimestampedModel, _FileSystemRelatedModel, _Exportable): +class Job(TimestampedModel, _FileSystemRelatedModel): objects = JobQuerySet.as_manager() segment = models.ForeignKey(Segment, on_delete=models.CASCADE) diff --git a/cvat/settings/base.py b/cvat/settings/base.py index 08bd5c6d666d..79abe2eb09e4 100644 --- a/cvat/settings/base.py +++ b/cvat/settings/base.py @@ -320,7 +320,7 @@ class CVAT_QUEUES(Enum): }, CVAT_QUEUES.CLEANING.value: { 
**shared_queue_settings, - 'DEFAULT_TIMEOUT': '1h', + 'DEFAULT_TIMEOUT': '2h', }, CVAT_QUEUES.CHUNKS.value: { **shared_queue_settings, @@ -353,20 +353,14 @@ class CVAT_QUEUES(Enum): 'func': 'cvat.apps.iam.utils.clean_up_sessions', 'cron_string': '0 0 * * *', }, - *( - { - 'queue': CVAT_QUEUES.CLEANING.value, - 'id': f'cron_{model.lower()}_export_cache_cleanup', - 'func': 'cvat.apps.engine.cron.cron_export_cache_cleanup', - # Run once a day at midnight - 'cron_string': cron_string, - 'args': (f'cvat.apps.engine.models.{model.title()}',), - } - for model, cron_string in zip( - ('project', 'task', 'job'), - ('0 0 * * *', '0 6 * * *', '0 12 * * *') - ) - ), + { + 'queue': CVAT_QUEUES.CLEANING.value, + 'id': f'cron_export_cache_cleanup', + 'func': 'cvat.apps.engine.cron.cron_export_cache_cleanup', + # Run twice a day (at midnight and at noon) + # 'cron_string': '0 0,12 * * *', + 'cron_string': '30 11 * * *', + } ] # JavaScript and CSS compression @@ -427,6 +421,9 @@ class CVAT_QUEUES(Enum): CACHE_ROOT = os.path.join(DATA_ROOT, 'cache') os.makedirs(CACHE_ROOT, exist_ok=True) +EXPORT_CACHE_ROOT = os.path.join(CACHE_ROOT, 'export') +os.makedirs(EXPORT_CACHE_ROOT, exist_ok=True) + EVENTS_LOCAL_DB_ROOT = os.path.join(CACHE_ROOT, 'events') os.makedirs(EVENTS_LOCAL_DB_ROOT, exist_ok=True) EVENTS_LOCAL_DB_FILE = os.path.join( From 18a832e7bab8f59634523d7e6273f410c39d6231 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 27 Dec 2024 11:02:24 +0100 Subject: [PATCH 25/61] Fix pylint issues && use os.scandir --- cvat/apps/dataset_manager/util.py | 1 - cvat/apps/engine/cron.py | 48 ++++++++++++++++--------------- cvat/apps/engine/models.py | 1 - cvat/settings/base.py | 5 ++-- 4 files changed, 27 insertions(+), 28 deletions(-) diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index b17dd2f2be58..39ecdcfcbdf5 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -6,7 +6,6 @@ import inspect import os import os.path as osp -import re import zipfile from collections.abc import Generator, Sequence from contextlib import contextmanager diff --git a/cvat/apps/engine/cron.py b/cvat/apps/engine/cron.py index 1aa6fd2b1b86..233ecd61b522 100644 --- a/cvat/apps/engine/cron.py +++ b/cvat/apps/engine/cron.py @@ -7,7 +7,7 @@ import os import os.path as osp from datetime import timedelta -from pathlib import Path +from functools import wraps from threading import Event, Thread from time import sleep from typing import Callable @@ -28,6 +28,17 @@ logger = ServerLogManager(__name__).glob +def suppress_exceptions(func: Callable[[CleanupExportCacheThread], None]): + @wraps(func) + def wrapper(self: CleanupExportCacheThread): + try: + func(self) + except Exception as ex: + self.set_exception(ex) + + return wrapper + + def clear_export_cache(file_path: str) -> None: with get_export_cache_lock( file_path, @@ -61,32 +72,23 @@ def removed_files_count(self) -> int: def exception_occurred(self) -> Exception | None: return self._exception_occurred - def suppress_exceptions(method: Callable): - def wrapper(self: CleanupExportCacheThread): - try: - method(self) - except Exception as ex: - self._exception_occurred = ex - - return wrapper + def set_exception(self, ex: Exception) -> None: + assert isinstance(ex, Exception) + self._exception_occurred = ex @suppress_exceptions def _cleanup_export_cache(self) -> None: - # raise Exception("Ooops") - export_cache_dir_path = Path(settings.EXPORT_CACHE_ROOT) - assert export_cache_dir_path.exists() + export_cache_dir_path 
= settings.EXPORT_CACHE_ROOT + assert os.path.exists(export_cache_dir_path) - # TODO: use scandir - for child in export_cache_dir_path.iterdir(): + for child in os.scandir(export_cache_dir_path): # stop clean up process correctly before rq job timeout is ended if self._stop_event.is_set(): return # export cache directory may contain temporary directories if not child.is_file(): - logger.debug( - f"The {child.relative_to(export_cache_dir_path)} is not a file, skipping..." - ) + logger.debug(f"The {child.name} is not a file, skipping...") continue try: @@ -100,8 +102,8 @@ def cron_export_cache_cleanup() -> None: started_at = timezone.now() rq_job = get_current_job() seconds_left = rq_job.timeout - 60 - sleep_interval = 30 - assert seconds_left > sleep_interval + 10 # TODO: + sleep_interval = 10 + assert seconds_left > sleep_interval finish_before = started_at + timedelta(seconds=seconds_left) stop_event = Event() @@ -118,14 +120,14 @@ def cron_export_cache_cleanup() -> None: stop_event.set() cleanup_export_cache_thread.join() - if exception_occurred := cleanup_export_cache_thread.exception_occurred: + if isinstance( + (exception_occurred := cleanup_export_cache_thread.exception_occurred), Exception + ): raise exception_occurred - removed_files_count = cleanup_export_cache_thread.removed_files_count - finished_at = timezone.now() logger.info( f"Export cache cleanup has been successfully " f"completed after {int((finished_at - started_at).total_seconds())} seconds. " - f"{removed_files_count} files have been removed" + f"{cleanup_export_cache_thread.removed_files_count} files have been removed" ) diff --git a/cvat/apps/engine/models.py b/cvat/apps/engine/models.py index 39b3671d71ce..ef6cc3f6f6a9 100644 --- a/cvat/apps/engine/models.py +++ b/cvat/apps/engine/models.py @@ -25,7 +25,6 @@ from django.db.models.fields import FloatField from django.db.models.base import ModelBase from django.utils.translation import gettext_lazy as _ -from django.utils import timezone from drf_spectacular.types import OpenApiTypes from drf_spectacular.utils import extend_schema_field diff --git a/cvat/settings/base.py b/cvat/settings/base.py index 79abe2eb09e4..57445ba9e4e2 100644 --- a/cvat/settings/base.py +++ b/cvat/settings/base.py @@ -355,11 +355,10 @@ class CVAT_QUEUES(Enum): }, { 'queue': CVAT_QUEUES.CLEANING.value, - 'id': f'cron_export_cache_cleanup', + 'id': 'cron_export_cache_cleanup', 'func': 'cvat.apps.engine.cron.cron_export_cache_cleanup', # Run twice a day (at midnight and at noon) - # 'cron_string': '0 0,12 * * *', - 'cron_string': '30 11 * * *', + 'cron_string': '0 0,12 * * *', } ] From 5ee90820b3f750e9ce3b06816b1020f3c721f05d Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 27 Dec 2024 13:54:38 +0100 Subject: [PATCH 26/61] Return re usage --- cvat/apps/dataset_manager/util.py | 49 +++++++++++++++++++------------ 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index 39ecdcfcbdf5..013f6e3abfd4 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -7,6 +7,7 @@ import os import os.path as osp import zipfile +import re from collections.abc import Generator, Sequence from contextlib import contextmanager from copy import deepcopy @@ -158,11 +159,18 @@ class ExportFileType(str, Enum): BACKUP = "backup" DATASET = "dataset" + @classmethod + def values(cls) -> list[str]: + return list(map(lambda x: x.value, cls)) + class InstanceType(str, Enum): PROJECT = "project" TASK = 
"task" JOB = "job" + @classmethod + def values(cls) -> list[str]: + return list(map(lambda x: x.value, cls)) @attrs.frozen class _ParsedExportFilename: @@ -207,7 +215,7 @@ def make_dataset_file_path( ) -> str: from .formats.registry import EXPORT_FORMATS - file_ext = EXPORT_FORMATS[format_name].EXT + file_ext = (EXPORT_FORMATS[format_name].EXT).lower() instance_type = InstanceType(instance_type.lower()) file_type = ExportFileType.DATASET if save_images else ExportFileType.ANNOTATIONS @@ -254,47 +262,50 @@ def parse_file_path( ) -> ParsedDatasetFilename | ParsedBackupFilename: file_path = osp.normpath(file_path) basename = osp.split(file_path)[1] - + basename, file_ext = osp.splitext(basename) + file_ext = file_ext.strip(".").lower() # handle file name - instance_type, unparsed = basename.split(cls.SPLITTER, maxsplit=1) - instance_type = InstanceType(instance_type) + basename_match = re.fullmatch( + ( + rf"^(?P{'|'.join(InstanceType.values())})" + rf"{cls.SPLITTER}(?P\d+)" + rf"{cls.SPLITTER}(?P{'|'.join(ExportFileType.values())})" + rf"{cls.SPLITTER}(?P.+)$" + ), + basename, + ) - instance_id, unparsed = basename.split(cls.SPLITTER, maxsplit=1) - instance_id = int(instance_id) + if not basename_match: + raise CacheFilePathParseError(f"Couldn't parse file name: {basename!r}") - file_type, unparsed = unparsed.split(cls.SPLITTER, maxsplit=1) - file_type = ExportFileType(file_type) + fragments = basename_match.groupdict() + fragments["instance_id"] = int(fragments["instance_id"]) - unparsed, file_ext = osp.splitext(unparsed) - unparsed = unparsed[len(cls.INSTANCE_PREFIX):] + unparsed = fragments.pop("unparsed")[len(cls.INSTANCE_PREFIX):] specific_params = {} - if file_type in (ExportFileType.DATASET, ExportFileType.ANNOTATIONS): + if fragments["file_type"] in (ExportFileType.DATASET, ExportFileType.ANNOTATIONS): try: instance_timestamp, format_repr = unparsed.split(cls.SPLITTER, maxsplit=1) except ValueError: - raise CacheFilePathParseError(f"Couldn't parse file name: '{basename}'") + raise CacheFilePathParseError(f"Couldn't parse file name: {basename!r}") specific_params["format_repr"] = format_repr ParsedFileNameClass = ParsedDatasetFilename - elif file_type == ExportFileType.BACKUP: + else: instance_timestamp = unparsed ParsedFileNameClass = ParsedBackupFilename - else: - raise CacheFilePathParseError(f"Unsupported file type: {file_type!r}") try: instance_timestamp = float(instance_timestamp) except ValueError: - raise CacheFilePathParseError(f"Couldn't parse instance timestamp: '{instance_timestamp}'") + raise CacheFilePathParseError(f"Couldn't parse instance timestamp: {instance_timestamp!r}") return ParsedFileNameClass( - file_type=file_type.value, file_ext=file_ext, - instance_id=instance_id, - instance_type=instance_type.value, instance_timestamp=instance_timestamp, + **fragments, **specific_params, ) From 17170136e5f064b98ab32e38ac58ab0b2c04920f Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 27 Dec 2024 13:55:06 +0100 Subject: [PATCH 27/61] Update tests --- cvat/apps/dataset_manager/tests/test_rest_api_formats.py | 5 +++++ cvat/apps/engine/tests/test_rest_api.py | 5 +++++ cvat/apps/engine/tests/utils.py | 6 ++++++ 3 files changed, 16 insertions(+) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index a59289cafa1b..ebd06a28677d 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -2093,7 +2093,12 @@ def 
_get_project_task_job_ids(): "cvat.apps.engine.cron.clear_export_cache", side_effect=clear_export_cache, ) as mock_clear_export_cache, + patch( + "cvat.apps.engine.cron.get_current_job", + ) as mock_rq_get_current_job, ): + mock_rq_job = MagicMock(timeout=100) + mock_rq_get_current_job.return_value = mock_rq_job cron_export_cache_cleanup() mock_clear_export_cache.assert_called_once() diff --git a/cvat/apps/engine/tests/test_rest_api.py b/cvat/apps/engine/tests/test_rest_api.py index e47fff0b61a2..314f7c262f72 100644 --- a/cvat/apps/engine/tests/test_rest_api.py +++ b/cvat/apps/engine/tests/test_rest_api.py @@ -3104,7 +3104,12 @@ def test_can_remove_export_cache_automatically_after_successful_export(self): "cvat.apps.engine.cron.clear_export_cache", side_effect=clear_export_cache, ) as mock_clear_export_cache, + mock.patch( + "cvat.apps.engine.cron.get_current_job", + ) as mock_rq_get_current_job, ): + mock_rq_job = mock.MagicMock(timeout=100) + mock_rq_get_current_job.return_value = mock_rq_job cron_export_cache_cleanup() mock_clear_export_cache.assert_not_called() diff --git a/cvat/apps/engine/tests/utils.py b/cvat/apps/engine/tests/utils.py index 910323cac1f7..189b831fe051 100644 --- a/cvat/apps/engine/tests/utils.py +++ b/cvat/apps/engine/tests/utils.py @@ -14,6 +14,7 @@ from django.core.cache import caches from django.http.response import HttpResponse from PIL import Image +from pathlib import Path from rest_framework.test import APITestCase import av import django_rq @@ -106,6 +107,11 @@ def _clear_temp_data(self): # Clear any remaining RQ jobs produced by the tests executed self._clear_rq_jobs() + # clear cache files created after previous exports + export_cache_dir = Path(settings.EXPORT_CACHE_ROOT) + for child in export_cache_dir.iterdir(): + os.remove(child) + def _clear_rq_jobs(self): clear_rq_jobs() From 4760fe78cab1f98505f0c49017ffa4ef5d754840 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 27 Dec 2024 15:02:17 +0100 Subject: [PATCH 28/61] fix --- cvat/apps/dataset_manager/project.py | 4 ++-- cvat/apps/dataset_manager/task.py | 4 ---- cvat/apps/dataset_manager/util.py | 6 ++---- cvat/apps/dataset_manager/views.py | 8 +++----- cvat/apps/engine/backup.py | 5 ++--- cvat/apps/engine/models.py | 9 +++++++-- 6 files changed, 16 insertions(+), 20 deletions(-) diff --git a/cvat/apps/dataset_manager/project.py b/cvat/apps/dataset_manager/project.py index 93ac651cf477..165a8183a558 100644 --- a/cvat/apps/dataset_manager/project.py +++ b/cvat/apps/dataset_manager/project.py @@ -139,7 +139,7 @@ def export(self, dst_file: str, exporter: Callable, host: str='', **options): ) temp_dir_base = self.db_project.get_tmp_dirname() - os.makedirs(temp_dir_base, exist_ok=True) + with TemporaryDirectory(dir=temp_dir_base) as temp_dir: exporter(dst_file, temp_dir, project_data, **options) @@ -156,7 +156,7 @@ def import_dataset(self, dataset_file, importer, **options): project_data.soft_attribute_import = True temp_dir_base = self.db_project.get_tmp_dirname() - os.makedirs(temp_dir_base, exist_ok=True) + with TemporaryDirectory(dir=temp_dir_base) as temp_dir: try: importer(dataset_file, temp_dir, project_data, load_data_callback=self.load_dataset_data, **options) diff --git a/cvat/apps/dataset_manager/task.py b/cvat/apps/dataset_manager/task.py index 83886d7e9cf1..a739b366425b 100644 --- a/cvat/apps/dataset_manager/task.py +++ b/cvat/apps/dataset_manager/task.py @@ -769,7 +769,6 @@ def export(self, dst_file, exporter, host='', **options): ) temp_dir_base = self.db_job.get_tmp_dirname() - 
os.makedirs(temp_dir_base, exist_ok=True) with TemporaryDirectory(dir=temp_dir_base) as temp_dir: exporter(dst_file, temp_dir, job_data, **options) @@ -782,7 +781,6 @@ def import_annotations(self, src_file, importer, **options): self.delete() temp_dir_base = self.db_job.get_tmp_dirname() - os.makedirs(temp_dir_base, exist_ok=True) with TemporaryDirectory(dir=temp_dir_base) as temp_dir: try: importer(src_file, temp_dir, job_data, **options) @@ -976,7 +974,6 @@ def export(self, dst_file, exporter, host='', **options): ) temp_dir_base = self.db_task.get_tmp_dirname() - os.makedirs(temp_dir_base, exist_ok=True) with TemporaryDirectory(dir=temp_dir_base) as temp_dir: exporter(dst_file, temp_dir, task_data, **options) @@ -989,7 +986,6 @@ def import_annotations(self, src_file, importer, **options): self.delete() temp_dir_base = self.db_task.get_tmp_dirname() - os.makedirs(temp_dir_base, exist_ok=True) with TemporaryDirectory(dir=temp_dir_base) as temp_dir: try: importer(src_file, temp_dir, task_data, **options) diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index 013f6e3abfd4..095e210102f3 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -205,7 +205,6 @@ class ExportCacheManager: @classmethod def make_dataset_file_path( cls, - cache_dir: str, *, instance_type: str, instance_id: int, @@ -232,12 +231,11 @@ def make_dataset_file_path( } ) - return osp.join(cache_dir, filename) + return osp.join(settings.EXPORT_CACHE_ROOT, filename) @classmethod def make_backup_file_path( cls, - cache_dir: str, *, instance_type: str, instance_id: int, @@ -254,7 +252,7 @@ def make_backup_file_path( "file_ext": "zip", } ) - return osp.join(cache_dir, filename) + return osp.join(settings.EXPORT_CACHE_ROOT, filename) @classmethod def parse_file_path( diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index 6b770596edc2..ffaafb5e14af 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -132,7 +132,6 @@ def export( db_instance = Job.objects.get(pk=job_id) cache_ttl = get_export_cache_ttl(db_instance) - cache_dir = settings.EXPORT_CACHE_ROOT # As we're not locking the db object here, it can be updated by the time of actual export. # The file will be saved with the older timestamp. @@ -147,7 +146,6 @@ def export( instance_update_time = max(tasks_update + [instance_update_time]) output_path = ExportCacheManager.make_dataset_file_path( - cache_dir, instance_id=db_instance.id, instance_type=db_instance.__class__.__name__, instance_timestamp=instance_update_time.timestamp(), @@ -155,8 +153,6 @@ def export( format_name=dst_format ) - os.makedirs(cache_dir, exist_ok=True) - # acquire a lock 2 times instead of using one long lock: # 1. to check whether the file exists or not # 2. 
to create a file when it doesn't exist @@ -169,7 +165,9 @@ def export( extend_export_file_lifetime(output_path) return output_path - with tempfile.TemporaryDirectory(dir=cache_dir) as temp_dir: + tmp_dir = db_instance.get_tmp_dirname() + + with tempfile.TemporaryDirectory(dir=tmp_dir) as temp_dir: temp_file = osp.join(temp_dir, 'result') export_fn(db_instance.id, temp_file, dst_format, server_url=server_url, save_images=save_images) diff --git a/cvat/apps/engine/backup.py b/cvat/apps/engine/backup.py index 82cbbcf4fac0..339a396f59ab 100644 --- a/cvat/apps/engine/backup.py +++ b/cvat/apps/engine/backup.py @@ -1035,12 +1035,11 @@ def create_backup( cache_ttl: timedelta, ): try: - cache_dir = settings.EXPORT_CACHE_ROOT + tmp_dir = db_instance.get_tmp_dirname() db_instance.refresh_from_db(fields=['updated_date']) instance_timestamp = timezone.localtime(db_instance.updated_date).timestamp() output_path = ExportCacheManager.make_backup_file_path( - cache_dir, instance_id=db_instance.id, instance_type=db_instance.__class__.__name__, instance_timestamp=instance_timestamp @@ -1057,7 +1056,7 @@ def create_backup( extend_export_file_lifetime(output_path) return output_path - with tempfile.TemporaryDirectory(dir=cache_dir) as temp_dir: + with tempfile.TemporaryDirectory(dir=tmp_dir) as temp_dir: temp_file = os.path.join(temp_dir, 'dump') exporter = Exporter(db_instance.id) exporter.export_to(temp_file) diff --git a/cvat/apps/engine/models.py b/cvat/apps/engine/models.py index ef6cc3f6f6a9..6c78dde6ea2b 100644 --- a/cvat/apps/engine/models.py +++ b/cvat/apps/engine/models.py @@ -443,8 +443,13 @@ class Meta: def get_dirname(self) -> str: ... - def get_tmp_dirname(self) -> str: - return os.path.join(self.get_dirname(), "tmp") + def get_tmp_dirname(self, create: bool = True) -> str: + dir_path = os.path.join(self.get_dirname(), "tmp") + + if create: + os.makedirs(dir_path, exist_ok=True) + + return dir_path @transaction.atomic(savepoint=False) From 3616a2568a8629f5d40b2e4376001783f422634e Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 27 Dec 2024 15:56:47 +0100 Subject: [PATCH 29/61] [doc] draft migration --- .../administration/advanced/upgrade_guide.md | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/site/content/en/docs/administration/advanced/upgrade_guide.md b/site/content/en/docs/administration/advanced/upgrade_guide.md index 3462b20d28a5..78f7cd2aeaca 100644 --- a/site/content/en/docs/administration/advanced/upgrade_guide.md +++ b/site/content/en/docs/administration/advanced/upgrade_guide.md @@ -56,6 +56,31 @@ To upgrade CVAT, follow these steps: docker logs cvat_server -f ``` +## Upgrade CVAT after 2.24.0 +TODO: +```python +import shutil +from datetime import datetime +from pathlib import Path + +from tqdm import tqdm + +from cvat.apps.engine.models import Job, Project, Task + +migration_date = datetime.now() # TODO: release date + +for Model in (Project, Task, Job): + print(f"Deleting the export cache for {Model.__name__.lower()}s...") + queryset = Model.objects.filter(created_date__lt=migration_date) + objects_count = queryset.count() + print(f"The {objects_count} folders are going to be checked") + + for obj in tqdm(queryset.iterator(), total=objects_count): + export_cache_dir = Path(obj.get_dirname()) / "export_cache" + if export_cache_dir.exists(): + shutil.rmtree(export_cache_dir) +``` + ## How to upgrade CVAT from v2.2.0 to v2.3.0. Step by step commands how to upgrade CVAT from v2.2.0 to v2.3.0. 
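The draft migration above removes each instance's old `export_cache` directory from the pre-2.25 layout. Below is a minimal dry-run sketch of the same idea, assuming it is executed from the Django shell of a CVAT deployment; the `DRY_RUN` flag and the final counter are illustrative additions, not part of the patch.

```python
# A hedged, minimal sketch: enumerate the old per-instance export caches
# and either report or delete them. Assumes the CVAT Django environment
# is already set up (e.g. run via `python manage.py shell`).
import shutil
from pathlib import Path

from cvat.apps.engine.models import Job, Project, Task

DRY_RUN = True  # illustrative flag: set to False to actually delete

found = 0
for Model in (Project, Task, Job):
    for obj in Model.objects.iterator():
        # the pre-2.25 layout kept an "export_cache" directory per instance
        old_cache_dir = Path(obj.get_dirname()) / "export_cache"
        if old_cache_dir.is_dir():
            found += 1
            if DRY_RUN:
                print(f"would remove {old_cache_dir}")
            else:
                shutil.rmtree(old_cache_dir)

print(f"{found} stale export cache directories found")
```

Running once with `DRY_RUN = True` makes it easy to review the affected directories before deleting anything.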
From 209e8fdee2f29b254460334d52b5c7c76e282872 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 27 Dec 2024 15:57:52 +0100 Subject: [PATCH 30/61] pylint --- cvat/apps/dataset_manager/project.py | 1 - cvat/apps/dataset_manager/task.py | 1 - 2 files changed, 2 deletions(-) diff --git a/cvat/apps/dataset_manager/project.py b/cvat/apps/dataset_manager/project.py index 165a8183a558..3b57b5817635 100644 --- a/cvat/apps/dataset_manager/project.py +++ b/cvat/apps/dataset_manager/project.py @@ -3,7 +3,6 @@ # # SPDX-License-Identifier: MIT -import os from collections.abc import Mapping from tempfile import TemporaryDirectory import rq diff --git a/cvat/apps/dataset_manager/task.py b/cvat/apps/dataset_manager/task.py index a739b366425b..92bb2fc42680 100644 --- a/cvat/apps/dataset_manager/task.py +++ b/cvat/apps/dataset_manager/task.py @@ -4,7 +4,6 @@ # SPDX-License-Identifier: MIT import itertools -import os from collections import OrderedDict from copy import deepcopy from enum import Enum From 481540a71b8cf6738b55978566236c2336ccc8f3 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Mon, 30 Dec 2024 13:21:35 +0100 Subject: [PATCH 31/61] Update documentation --- cvat/apps/dataset_manager/util.py | 2 +- cvat/apps/engine/cron.py | 12 ++--- .../administration/advanced/upgrade_guide.md | 51 +++++++++++++------ 3 files changed, 43 insertions(+), 22 deletions(-) diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index 095e210102f3..fca03b5185f3 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -6,8 +6,8 @@ import inspect import os import os.path as osp -import zipfile import re +import zipfile from collections.abc import Generator, Sequence from contextlib import contextmanager from copy import deepcopy diff --git a/cvat/apps/engine/cron.py b/cvat/apps/engine/cron.py index 233ecd61b522..bfab89685643 100644 --- a/cvat/apps/engine/cron.py +++ b/cvat/apps/engine/cron.py @@ -61,7 +61,7 @@ class CleanupExportCacheThread(Thread): def __init__(self, stop_event: Event, *args, **kwargs) -> None: self._stop_event = stop_event self._removed_files_count = 0 - self._exception_occurred = None + self._exception = None super().__init__(*args, **kwargs, target=self._cleanup_export_cache) @property @@ -69,12 +69,12 @@ def removed_files_count(self) -> int: return self._removed_files_count @property - def exception_occurred(self) -> Exception | None: - return self._exception_occurred + def exception(self) -> Exception | None: + return self._exception def set_exception(self, ex: Exception) -> None: assert isinstance(ex, Exception) - self._exception_occurred = ex + self._exception = ex @suppress_exceptions def _cleanup_export_cache(self) -> None: @@ -121,9 +121,9 @@ def cron_export_cache_cleanup() -> None: cleanup_export_cache_thread.join() if isinstance( - (exception_occurred := cleanup_export_cache_thread.exception_occurred), Exception + (exception := cleanup_export_cache_thread.exception), Exception ): - raise exception_occurred + raise exception finished_at = timezone.now() logger.info( diff --git a/site/content/en/docs/administration/advanced/upgrade_guide.md b/site/content/en/docs/administration/advanced/upgrade_guide.md index 78f7cd2aeaca..4d23a8a6fa7e 100644 --- a/site/content/en/docs/administration/advanced/upgrade_guide.md +++ b/site/content/en/docs/administration/advanced/upgrade_guide.md @@ -56,31 +56,52 @@ To upgrade CVAT, follow these steps: docker logs cvat_server -f ``` -## Upgrade CVAT after 2.24.0 -TODO: +## Upgrade CVAT 
after v2.25.0
+
+In version 2.25.0, CVAT changed the location where the export cache is stored.
+The following Python script can be used to remove outdated files from the previous location:
+
 ```python
 import shutil
-from datetime import datetime
 from pathlib import Path
-
+from django.utils import timezone
 from tqdm import tqdm
-
 from cvat.apps.engine.models import Job, Project, Task
 
-migration_date = datetime.now() # TODO: release date
 
-for Model in (Project, Task, Job):
-    print(f"Deleting the export cache for {Model.__name__.lower()}s...")
-    queryset = Model.objects.filter(created_date__lt=migration_date)
-    objects_count = queryset.count()
-    print(f"The {objects_count} folders are going to be checked")
+def cleanup_outdated_cache():
+    now = timezone.now()
+
+    for Model in (Project, Task, Job):
+        print(f"Deleting the export cache for {Model.__name__.lower()}s...")
+        queryset = Model.objects.filter(created_date__lt=now)
+        objects_count = queryset.count()
+        if objects_count < 1:
+            continue
+
+        print(f"{objects_count} folder{'s are' if objects_count > 1 else ' is'} going to be checked")
+
+        for obj in tqdm(queryset.iterator(), total=objects_count):
+            export_cache_dir = Path(obj.get_dirname()) / "export_cache"
+            if export_cache_dir.exists():
+                shutil.rmtree(export_cache_dir)
 
-    for obj in tqdm(queryset.iterator(), total=objects_count):
-        export_cache_dir = Path(obj.get_dirname()) / "export_cache"
-        if export_cache_dir.exists():
-            shutil.rmtree(export_cache_dir)
+
+if __name__ == "__main__":
+    cleanup_outdated_cache()
+
 ```
 
+### How to run the script
+
+1. Save the script as `cleanup_script.py` in the directory where `manage.py` is located
+1. Run the Django shell: `python manage.py shell`
+1. Import and execute the script:
+   ```python
+   from cleanup_script import cleanup_outdated_cache
+   cleanup_outdated_cache()
+   ```
+
 ## How to upgrade CVAT from v2.2.0 to v2.3.0.
 Step by step commands how to upgrade CVAT from v2.2.0 to v2.3.0.
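The cleanup cron job developed in the patches above pairs a worker thread with a `threading.Event`, so a long cache scan can be stopped cleanly before the RQ job times out and a worker failure is re-raised in the main thread. Here is a condensed, self-contained sketch of that supervision pattern using only the standard library; all names are illustrative stand-ins for `CleanupExportCacheThread` and `cron_export_cache_cleanup`, not the module's actual API.

```python
# A hedged sketch of the stop-event/worker pattern, not the real implementation.
from threading import Event, Thread
from time import monotonic, sleep


class Worker(Thread):
    def __init__(self, stop_event: Event) -> None:
        super().__init__(target=self._run)
        self._stop_event = stop_event
        self._exception: Exception | None = None

    def _run(self) -> None:
        try:
            for _ in range(1000):  # stands in for iterating over cache files
                if self._stop_event.is_set():
                    return  # exit cooperatively between items
                sleep(0.01)  # stands in for checking/removing one file
        except Exception as ex:
            self._exception = ex  # saved so the main thread can re-raise it

    def raise_if_exception(self) -> None:
        if self._exception is not None:
            raise self._exception


def supervise(budget_seconds: float = 1.0) -> None:
    stop_event = Event()
    worker = Worker(stop_event)
    worker.start()

    deadline = monotonic() + budget_seconds
    while worker.is_alive() and monotonic() < deadline:
        sleep(0.1)  # periodic liveness check, like the cron job's wait loop

    stop_event.set()  # ask the worker to stop after the current item
    worker.join()
    worker.raise_if_exception()  # surface worker failures to the RQ job


if __name__ == "__main__":
    supervise()
```

Because the worker only checks the event between items, each file operation stays atomic: the scan never stops half-way through removing a file, yet the shutdown time stays bounded.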
From abbf3b51bc600451b8489d7badd068d792fd7744 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Tue, 31 Dec 2024 11:23:13 +0100 Subject: [PATCH 32/61] Move cron.py into dataset_manager app --- cvat/apps/{engine => dataset_manager}/cron.py | 0 .../tests/test_rest_api_formats.py | 22 +++++++++---------- cvat/apps/engine/tests/test_rest_api.py | 6 ++--- cvat/settings/base.py | 2 +- 4 files changed, 15 insertions(+), 15 deletions(-) rename cvat/apps/{engine => dataset_manager}/cron.py (100%) diff --git a/cvat/apps/engine/cron.py b/cvat/apps/dataset_manager/cron.py similarity index 100% rename from cvat/apps/engine/cron.py rename to cvat/apps/dataset_manager/cron.py diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index cde62d6fad82..9a64902e5606 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -38,7 +38,7 @@ from cvat.apps.dataset_manager.views import export from cvat.apps.engine.models import Task from cvat.apps.engine.tests.utils import get_paginated_collection, ApiTestBase, ForceLogin -from cvat.apps.engine.cron import clear_export_cache +from cvat.apps.dataset_manager.cron import clear_export_cache projects_path = osp.join(osp.dirname(__file__), 'assets', 'projects.json') with open(projects_path) as file: @@ -1500,10 +1500,10 @@ def _clear(*_, file_path: str): from os import remove as original_remove with ( - patch("cvat.apps.engine.cron.EXPORT_CACHE_LOCK_TTL", new=EXPORT_CACHE_LOCK_TTL), - patch("cvat.apps.engine.cron.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", new=EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT), + patch("cvat.apps.dataset_manager.cron.EXPORT_CACHE_LOCK_TTL", new=EXPORT_CACHE_LOCK_TTL), + patch("cvat.apps.dataset_manager.cron.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", new=EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT), patch( - "cvat.apps.engine.cron.get_export_cache_lock", + "cvat.apps.dataset_manager.cron.get_export_cache_lock", new=self.patched_get_export_cache_lock, ), patch( @@ -1681,12 +1681,12 @@ def _clear(*_, file_path: str): from cvat.apps.dataset_manager.util import LockNotAvailableError with ( - patch("cvat.apps.engine.cron.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", new=3), + patch("cvat.apps.dataset_manager.cron.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", new=3), patch( - "cvat.apps.engine.cron.get_export_cache_lock", + "cvat.apps.dataset_manager.cron.get_export_cache_lock", new=self.patched_get_export_cache_lock, ), - patch("cvat.apps.engine.cron.os.remove") as mock_os_remove, + patch("cvat.apps.dataset_manager.cron.os.remove") as mock_os_remove, patch( "cvat.apps.dataset_manager.views.TTL_CONSTS", new={"task": timedelta(seconds=0)} ), @@ -2046,7 +2046,7 @@ def test_cleanup_can_defer_removal_if_file_is_used_recently(self): with ( patch("cvat.apps.dataset_manager.views.TTL_CONSTS", new={"task": timedelta(hours=1)}), - patch("cvat.apps.engine.cron.os.remove", side_effect=original_remove) as mock_os_remove, + patch("cvat.apps.dataset_manager.cron.os.remove", side_effect=original_remove) as mock_os_remove, ): export_path = export(dst_format=format_name, task_id=task_id) clear_export_cache(file_path=export_path) @@ -2055,7 +2055,7 @@ def test_cleanup_can_defer_removal_if_file_is_used_recently(self): self.assertTrue(osp.isfile(export_path)) def test_cleanup_cron_job_can_delete_cached_files(self): - from cvat.apps.engine.cron import cron_export_cache_cleanup + from cvat.apps.dataset_manager.cron import 
cron_export_cache_cleanup def _get_project_task_job_ids(): project = self._create_project(projects["main"]) @@ -2090,11 +2090,11 @@ def _get_project_task_job_ids(): new={resource: timedelta(seconds=0)}, ), patch( - "cvat.apps.engine.cron.clear_export_cache", + "cvat.apps.dataset_manager.cron.clear_export_cache", side_effect=clear_export_cache, ) as mock_clear_export_cache, patch( - "cvat.apps.engine.cron.get_current_job", + "cvat.apps.dataset_manager.cron.get_current_job", ) as mock_rq_get_current_job, ): mock_rq_job = MagicMock(timeout=100) diff --git a/cvat/apps/engine/tests/test_rest_api.py b/cvat/apps/engine/tests/test_rest_api.py index bbf4071f0e1d..625943f40c16 100644 --- a/cvat/apps/engine/tests/test_rest_api.py +++ b/cvat/apps/engine/tests/test_rest_api.py @@ -3091,7 +3091,7 @@ def test_api_v2_tasks_id_export_no_auth(self): self._run_api_v2_tasks_id_export_import(None) def test_can_remove_export_cache_automatically_after_successful_export(self): - from cvat.apps.engine.cron import cron_export_cache_cleanup, clear_export_cache + from cvat.apps.dataset_manager.cron import cron_export_cache_cleanup, clear_export_cache self._create_tasks() task_id = self.tasks[0]["id"] user = self.admin @@ -3101,11 +3101,11 @@ def test_can_remove_export_cache_automatically_after_successful_export(self): mock.patch('cvat.apps.dataset_manager.views.TASK_CACHE_TTL', new=TASK_CACHE_TTL), mock.patch('cvat.apps.dataset_manager.views.TTL_CONSTS', new={'task': TASK_CACHE_TTL}), mock.patch( - "cvat.apps.engine.cron.clear_export_cache", + "cvat.apps.dataset_manager.cron.clear_export_cache", side_effect=clear_export_cache, ) as mock_clear_export_cache, mock.patch( - "cvat.apps.engine.cron.get_current_job", + "cvat.apps.dataset_manager.cron.get_current_job", ) as mock_rq_get_current_job, ): mock_rq_job = mock.MagicMock(timeout=100) diff --git a/cvat/settings/base.py b/cvat/settings/base.py index 57445ba9e4e2..29e037f79078 100644 --- a/cvat/settings/base.py +++ b/cvat/settings/base.py @@ -356,7 +356,7 @@ class CVAT_QUEUES(Enum): { 'queue': CVAT_QUEUES.CLEANING.value, 'id': 'cron_export_cache_cleanup', - 'func': 'cvat.apps.engine.cron.cron_export_cache_cleanup', + 'func': 'cvat.apps.dataset_manager.cron.cron_export_cache_cleanup', # Run twice a day (at midnight and at noon) 'cron_string': '0 0,12 * * *', } From 7461d3dad3b740c8c55d859a8c573e2484d7461a Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Tue, 31 Dec 2024 11:33:50 +0100 Subject: [PATCH 33/61] Add EXPORT_CACHE_ROOT for unit tests --- cvat/settings/testing.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cvat/settings/testing.py b/cvat/settings/testing.py index 3cd47559fbd0..d7f0e993ae68 100644 --- a/cvat/settings/testing.py +++ b/cvat/settings/testing.py @@ -24,6 +24,9 @@ CACHE_ROOT = os.path.join(DATA_ROOT, 'cache') os.makedirs(CACHE_ROOT, exist_ok=True) +EXPORT_CACHE_ROOT = os.path.join(CACHE_ROOT, 'export') +os.makedirs(EXPORT_CACHE_ROOT, exist_ok=True) + JOBS_ROOT = os.path.join(DATA_ROOT, 'jobs') os.makedirs(JOBS_ROOT, exist_ok=True) From 071c493fae972a86238d0868a6966f70b2f46a97 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Tue, 31 Dec 2024 11:35:07 +0100 Subject: [PATCH 34/61] Use kwonly arguments --- cvat/apps/engine/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cvat/apps/engine/models.py b/cvat/apps/engine/models.py index 6c78dde6ea2b..0a67822681c8 100644 --- a/cvat/apps/engine/models.py +++ b/cvat/apps/engine/models.py @@ -443,7 +443,7 @@ class Meta: def get_dirname(self) -> str: ... 
-    def get_tmp_dirname(self, create: bool = True) -> str:
+    def get_tmp_dirname(self, *, create: bool = True) -> str:
         dir_path = os.path.join(self.get_dirname(), "tmp")

         if create:

From 3e42a3c3da9289493eec2d482e4c2a2e4a22cd6b Mon Sep 17 00:00:00 2001
From: Maria Khrustaleva
Date: Tue, 31 Dec 2024 11:46:42 +0100
Subject: [PATCH 35/61] Refactor CleanupExportCacheThread

---
 cvat/apps/dataset_manager/cron.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/cvat/apps/dataset_manager/cron.py b/cvat/apps/dataset_manager/cron.py
index bfab89685643..d55207481a38 100644
--- a/cvat/apps/dataset_manager/cron.py
+++ b/cvat/apps/dataset_manager/cron.py
@@ -68,10 +68,6 @@ def __init__(self, stop_event: Event, *args, **kwargs) -> None:
     def removed_files_count(self) -> int:
         return self._removed_files_count

-    @property
-    def exception(self) -> Exception | None:
-        return self._exception
-
     def set_exception(self, ex: Exception) -> None:
         assert isinstance(ex, Exception)
         self._exception = ex
@@ -97,6 +93,10 @@ def _cleanup_export_cache(self) -> None:
         except Exception:
             log_exception(logger)

+    def raise_if_exception(self) -> None:
+        if isinstance(self._exception, Exception):
+            raise self._exception
+

 def cron_export_cache_cleanup() -> None:
     started_at = timezone.now()
@@ -120,10 +120,7 @@ def cron_export_cache_cleanup() -> None:
             stop_event.set()

     cleanup_export_cache_thread.join()
-    if isinstance(
-        (exception := cleanup_export_cache_thread.exception), Exception
-    ):
-        raise exception
+    cleanup_export_cache_thread.raise_if_exception()

     finished_at = timezone.now()
     logger.info(

From c658eb67c3704f2884a031ebd8aef8c557de4f3c Mon Sep 17 00:00:00 2001
From: Maria Khrustaleva
Date: Tue, 31 Dec 2024 11:47:07 +0100
Subject: [PATCH 36/61] Fix path in dev/format_python_code.sh

---
 dev/format_python_code.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev/format_python_code.sh b/dev/format_python_code.sh
index f3ae0b63c315..f7220679073c 100755
--- a/dev/format_python_code.sh
+++ b/dev/format_python_code.sh
@@ -27,11 +27,11 @@ for paths in \
     "cvat/apps/engine/background.py" \
     "cvat/apps/engine/frame_provider.py" \
     "cvat/apps/engine/cache.py" \
-    "cvat/apps/engine/cron.py" \
     "cvat/apps/engine/default_settings.py" \
     "cvat/apps/engine/field_validation.py" \
     "cvat/apps/engine/model_utils.py" \
     "cvat/apps/engine/task_validation.py" \
+    "cvat/apps/dataset_manager/cron.py" \
     "cvat/apps/dataset_manager/tests/test_annotation.py" \
     "cvat/apps/dataset_manager/tests/utils.py" \
     "cvat/apps/events/signals.py" \

From 6963465f3ca36a13497c5a0e037e7f302e47c6aa Mon Sep 17 00:00:00 2001
From: Maria Khrustaleva
Date: Tue, 31 Dec 2024 11:58:07 +0100
Subject: [PATCH 37/61] Cleanup temp export dirs after failed unit tests

---
 cvat/apps/engine/tests/utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/cvat/apps/engine/tests/utils.py b/cvat/apps/engine/tests/utils.py
index 189b831fe051..de803319e56a 100644
--- a/cvat/apps/engine/tests/utils.py
+++ b/cvat/apps/engine/tests/utils.py
@@ -9,6 +9,7 @@
 import itertools
 import logging
 import os
+import shutil

 from django.conf import settings
 from django.core.cache import caches
@@ -110,7 +111,10 @@ def _clear_temp_data(self):
         # clear cache files created after previous exports
         export_cache_dir = Path(settings.EXPORT_CACHE_ROOT)
         for child in export_cache_dir.iterdir():
-            os.remove(child)
+            if child.is_dir():
+                shutil.rmtree(child)
+            else:
+                os.remove(child)

     def _clear_rq_jobs(self):
         clear_rq_jobs()

From 
a82db50577537af693053ff751513db0c8869767 Mon Sep 17 00:00:00 2001
From: Maria Khrustaleva
Date: Fri, 3 Jan 2025 09:48:19 +0100
Subject: [PATCH 38/61] Rename class

---
 cvat/apps/engine/models.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cvat/apps/engine/models.py b/cvat/apps/engine/models.py
index 0a67822681c8..37d9c143b298 100644
--- a/cvat/apps/engine/models.py
+++ b/cvat/apps/engine/models.py
@@ -435,7 +435,7 @@ def touch(self) -> None:
 class ABCModelMeta(ABCMeta, ModelBase):
     pass

-class _FileSystemRelatedModel(models.Model, metaclass=ABCModelMeta):
+class FileSystemRelatedModel(models.Model, metaclass=ABCModelMeta):
     class Meta:
         abstract = True

@@ -489,7 +489,7 @@ def clear_annotations_on_frames_in_honeypot_task(db_task: Task, frames: Sequence
             frame__in=frames_batch,
         ).delete()

-class Project(TimestampedModel, _FileSystemRelatedModel):
+class Project(TimestampedModel, FileSystemRelatedModel):
     name = SafeCharField(max_length=256)
     owner = models.ForeignKey(User, null=True, blank=True,
         on_delete=models.SET_NULL, related_name="+")
@@ -564,7 +564,7 @@ def with_job_summary(self):
             )
         )

-class Task(TimestampedModel, _FileSystemRelatedModel):
+class Task(TimestampedModel, FileSystemRelatedModel):
     objects = TaskQuerySet.as_manager()

     project = models.ForeignKey(Project, on_delete=models.CASCADE,
@@ -855,7 +855,7 @@ def _validate_constraints(self, obj: dict[str, Any]):



-class Job(TimestampedModel, _FileSystemRelatedModel):
+class Job(TimestampedModel, FileSystemRelatedModel):
     objects = JobQuerySet.as_manager()

     segment = models.ForeignKey(Segment, on_delete=models.CASCADE)

From ed7ae38ef27764d3e1613edd2c1f396d136d6c0f Mon Sep 17 00:00:00 2001
From: Maria Khrustaleva
Date: Fri, 3 Jan 2025 09:48:48 +0100
Subject: [PATCH 39/61] [unit tests] Remove sleep usage

---
 cvat/apps/engine/tests/test_rest_api.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/cvat/apps/engine/tests/test_rest_api.py b/cvat/apps/engine/tests/test_rest_api.py
index 625943f40c16..7697a50aa6f8 100644
--- a/cvat/apps/engine/tests/test_rest_api.py
+++ b/cvat/apps/engine/tests/test_rest_api.py
@@ -3096,7 +3096,7 @@ def test_can_remove_export_cache_automatically_after_successful_export(self):
         task_id = self.tasks[0]["id"]
         user = self.admin

-        TASK_CACHE_TTL = timedelta(seconds=5)
+        TASK_CACHE_TTL = timedelta(hours=1)
         with (
             mock.patch('cvat.apps.dataset_manager.views.TASK_CACHE_TTL', new=TASK_CACHE_TTL),
             mock.patch('cvat.apps.dataset_manager.views.TTL_CONSTS', new={'task': TASK_CACHE_TTL}),
@@ -3127,10 +3127,12 @@ def test_can_remove_export_cache_automatically_after_successful_export(self):
             file_path = job.return_value()
             self.assertTrue(os.path.isfile(file_path))

-            sleep(TASK_CACHE_TTL.total_seconds() + 1)
-
-            cron_export_cache_cleanup()
-            mock_clear_export_cache.assert_called_once()
+            with (
+                mock.patch('cvat.apps.dataset_manager.views.TASK_CACHE_TTL', new=timedelta(seconds=0)),
+                mock.patch('cvat.apps.dataset_manager.views.TTL_CONSTS', new={'task': timedelta(seconds=0)}),
+            ):
+                cron_export_cache_cleanup()
+                mock_clear_export_cache.assert_called_once()

             self.assertFalse(os.path.exists(file_path))

From 5708cd8dffd543ada3d3b8eef9dd5eb6056fb6a0 Mon Sep 17 00:00:00 2001
From: Maria Khrustaleva
Date: Tue, 7 Jan 2025 13:29:14 +0100
Subject: [PATCH 40/61] [Export logic] Use common tmp dir && do not create tmp
 dir twice

---
 cvat/apps/dataset_manager/cron.py | 4 +-
 cvat/apps/dataset_manager/project.py | 45 +++++---
 cvat/apps/dataset_manager/task.py | 76 ++++++++++---
.../tests/test_rest_api_formats.py | 4 +- cvat/apps/dataset_manager/util.py | 106 +++++++++++++++--- cvat/apps/dataset_manager/views.py | 16 +-- cvat/apps/engine/backup.py | 16 ++- cvat/apps/engine/models.py | 10 +- cvat/apps/engine/views.py | 22 ++-- 9 files changed, 223 insertions(+), 76 deletions(-) diff --git a/cvat/apps/dataset_manager/cron.py b/cvat/apps/dataset_manager/cron.py index d55207481a38..c6e6eeb9c9af 100644 --- a/cvat/apps/dataset_manager/cron.py +++ b/cvat/apps/dataset_manager/cron.py @@ -82,9 +82,9 @@ def _cleanup_export_cache(self) -> None: if self._stop_event.is_set(): return - # export cache directory may contain temporary directories + # export cache directory is expected to contain only files if not child.is_file(): - logger.debug(f"The {child.name} is not a file, skipping...") + logger.warning(f"The {child.name} is not a file, skipping...") continue try: diff --git a/cvat/apps/dataset_manager/project.py b/cvat/apps/dataset_manager/project.py index 3b57b5817635..2e81c29a444b 100644 --- a/cvat/apps/dataset_manager/project.py +++ b/cvat/apps/dataset_manager/project.py @@ -3,14 +3,15 @@ # # SPDX-License-Identifier: MIT -from collections.abc import Mapping -from tempfile import TemporaryDirectory +from collections.abc import Mapping, Callable +import io import rq -from typing import Any, Callable +from typing import Any from datumaro.components.errors import DatasetError, DatasetImportError, DatasetNotFoundError from django.db import transaction from django.conf import settings +from django.utils import timezone from cvat.apps.engine import models from cvat.apps.engine.log import DatasetLogManager @@ -18,6 +19,8 @@ from cvat.apps.engine.task import _create_thread as create_task from cvat.apps.engine.rq_job_handler import RQJobMetaField from cvat.apps.dataset_manager.task import TaskAnnotation +from cvat.apps.dataset_manager.util import TmpDirManager +from contextlib import nullcontext from .annotation import AnnotationIR from .bindings import CvatDatasetNotFoundError, ProjectData, load_dataset_data, CvatImportError @@ -25,8 +28,15 @@ dlogger = DatasetLogManager() -def export_project(project_id, dst_file, format_name, - server_url=None, save_images=False): +def export_project( + project_id: int, + dst_file: str, + *, + format_name: str, + server_url: str | None = None, + save_images: bool = False, + temp_dir: str | None = None, +): # For big tasks dump function may run for a long time and # we dont need to acquire lock after the task has been initialized from DB. 
# But there is the bug with corrupted dump file in case 2 or @@ -38,7 +48,7 @@ def export_project(project_id, dst_file, format_name, exporter = make_exporter(format_name) with open(dst_file, 'wb') as f: - project.export(f, exporter, host=server_url, save_images=save_images) + project.export(f, exporter, host=server_url, save_images=save_images, temp_dir=temp_dir) class ProjectAnnotationAndData: def __init__(self, pk: int): @@ -130,16 +140,27 @@ def init_from_db(self): self.task_annotations[task.id] = annotation self.annotation_irs[task.id] = annotation.ir_data - def export(self, dst_file: str, exporter: Callable, host: str='', **options): + def export( + self, + dst_file: io.BufferedWriter, + exporter: Callable[..., None], + *, + host: str = '', + temp_dir: str | None = None, + **options + ): project_data = ProjectData( annotation_irs=self.annotation_irs, db_project=self.db_project, host=host ) - temp_dir_base = self.db_project.get_tmp_dirname() - - with TemporaryDirectory(dir=temp_dir_base) as temp_dir: + with ( + TmpDirManager.get_tmp_export_dir( + instance_type=self.db_project.__class__.__name__, + instance_timestamp=timezone.localtime(self.db_project.updated_date).timestamp(), + ) if not temp_dir else nullcontext(temp_dir) + ) as temp_dir: exporter(dst_file, temp_dir, project_data, **options) def load_dataset_data(self, *args, **kwargs): @@ -154,9 +175,7 @@ def import_dataset(self, dataset_file, importer, **options): ) project_data.soft_attribute_import = True - temp_dir_base = self.db_project.get_tmp_dirname() - - with TemporaryDirectory(dir=temp_dir_base) as temp_dir: + with TmpDirManager.get_tmp_dir() as temp_dir: try: importer(dataset_file, temp_dir, project_data, load_data_callback=self.load_dataset_data, **options) except (DatasetNotFoundError, CvatDatasetNotFoundError) as not_found: diff --git a/cvat/apps/dataset_manager/task.py b/cvat/apps/dataset_manager/task.py index 92bb2fc42680..968e58947e5b 100644 --- a/cvat/apps/dataset_manager/task.py +++ b/cvat/apps/dataset_manager/task.py @@ -1,16 +1,20 @@ # Copyright (C) 2019-2022 Intel Corporation -# Copyright (C) 2022-2024 CVAT.ai Corporation +# Copyright (C) 2022-2025 CVAT.ai Corporation # # SPDX-License-Identifier: MIT +import io import itertools from collections import OrderedDict from copy import deepcopy from enum import Enum -from tempfile import TemporaryDirectory from typing import Optional, Union from datumaro.components.errors import DatasetError, DatasetImportError, DatasetNotFoundError +from contextlib import nullcontext +from collections.abc import Callable +from django.utils import timezone + from django.db import transaction from django.db.models.query import Prefetch, QuerySet from django.conf import settings @@ -27,7 +31,7 @@ from cvat.apps.dataset_manager.bindings import TaskData, JobData, CvatImportError, CvatDatasetNotFoundError from cvat.apps.dataset_manager.formats.registry import make_exporter, make_importer from cvat.apps.dataset_manager.util import ( - add_prefetch_fields, bulk_create, get_cached, faster_deepcopy + add_prefetch_fields, bulk_create, get_cached, faster_deepcopy, TmpDirManager ) dlogger = DatasetLogManager() @@ -760,15 +764,27 @@ def init_from_db(self): def data(self): return self.ir_data.data - def export(self, dst_file, exporter, host='', **options): + def export( + self, + dst_file: io.BufferedWriter, + exporter: Callable[..., None], + *, + host: str = '', + temp_dir: str | None = None, + **options + ): job_data = JobData( annotation_ir=self.ir_data, db_job=self.db_job, host=host, ) - 
temp_dir_base = self.db_job.get_tmp_dirname() - with TemporaryDirectory(dir=temp_dir_base) as temp_dir: + with ( + TmpDirManager.get_tmp_export_dir( + instance_type=self.db_job.__class__.__name__, + instance_timestamp=timezone.localtime(self.db_job.updated_date).timestamp(), + ) if not temp_dir else nullcontext(temp_dir) + ) as temp_dir: exporter(dst_file, temp_dir, job_data, **options) def import_annotations(self, src_file, importer, **options): @@ -779,8 +795,7 @@ def import_annotations(self, src_file, importer, **options): ) self.delete() - temp_dir_base = self.db_job.get_tmp_dirname() - with TemporaryDirectory(dir=temp_dir_base) as temp_dir: + with TmpDirManager.get_tmp_dir() as temp_dir: try: importer(src_file, temp_dir, job_data, **options) except (DatasetNotFoundError, CvatDatasetNotFoundError) as not_found: @@ -965,15 +980,27 @@ def init_from_db(self): self._merge_data(gt_annotation.ir_data, start_frame=db_job.segment.start_frame) - def export(self, dst_file, exporter, host='', **options): + def export( + self, + dst_file: io.BufferedWriter, + exporter: Callable[..., None], + *, + host: str = '', + temp_dir: str | None = None, + **options + ): task_data = TaskData( annotation_ir=self.ir_data, db_task=self.db_task, host=host, ) - temp_dir_base = self.db_task.get_tmp_dirname() - with TemporaryDirectory(dir=temp_dir_base) as temp_dir: + with ( + TmpDirManager.get_tmp_export_dir( + instance_type=self.db_task.__class__.__name__, + instance_timestamp=timezone.localtime(self.db_task.updated_date).timestamp(), + ) if not temp_dir else nullcontext(temp_dir) + ) as temp_dir: exporter(dst_file, temp_dir, task_data, **options) def import_annotations(self, src_file, importer, **options): @@ -984,8 +1011,7 @@ def import_annotations(self, src_file, importer, **options): ) self.delete() - temp_dir_base = self.db_task.get_tmp_dirname() - with TemporaryDirectory(dir=temp_dir_base) as temp_dir: + with TmpDirManager.get_tmp_dir() as temp_dir: try: importer(src_file, temp_dir, task_data, **options) except (DatasetNotFoundError, CvatDatasetNotFoundError) as not_found: @@ -1047,7 +1073,15 @@ def delete_job_data(pk, *, db_job: models.Job | None = None): annotation.delete() -def export_job(job_id, dst_file, format_name, server_url=None, save_images=False): +def export_job( + job_id: int, + dst_file: str, + *, + format_name: str, + server_url: str | None = None, + save_images=False, + temp_dir: str | None = None, +): # For big tasks dump function may run for a long time and # we dont need to acquire lock after the task has been initialized from DB. # But there is the bug with corrupted dump file in case 2 or @@ -1059,7 +1093,7 @@ def export_job(job_id, dst_file, format_name, server_url=None, save_images=False exporter = make_exporter(format_name) with open(dst_file, 'wb') as f: - job.export(f, exporter, host=server_url, save_images=save_images) + job.export(f, exporter, host=server_url, save_images=save_images, temp_dir=temp_dir) @silk_profile(name="GET task data") @@ -1101,7 +1135,15 @@ def delete_task_data(pk): annotation.delete() -def export_task(task_id, dst_file, format_name, server_url=None, save_images=False): +def export_task( + task_id: int, + *, + dst_file: str, + format_name: str, + server_url: str | None = None, + save_images: bool = False, + temp_dir: str | None = None, + ): # For big tasks dump function may run for a long time and # we dont need to acquire lock after the task has been initialized from DB. 
# But there is the bug with corrupted dump file in case 2 or @@ -1113,7 +1155,7 @@ def export_task(task_id, dst_file, format_name, server_url=None, save_images=Fal exporter = make_exporter(format_name) with open(dst_file, 'wb') as f: - task.export(f, exporter, host=server_url, save_images=save_images) + task.export(f, exporter, host=server_url, save_images=save_images, temp_dir=temp_dir) @transaction.atomic diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index 9a64902e5606..b2eef701cb86 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -2032,8 +2032,8 @@ def test_cleanup_can_remove_file(self): def test_cleanup_can_fail_if_no_file(self): - from cvat.apps.dataset_manager.util import CacheFilePathParseError - with self.assertRaises(CacheFilePathParseError): + from cvat.apps.dataset_manager.util import CacheFileOrDirPathParseError + with self.assertRaises(CacheFileOrDirPathParseError): clear_export_cache(file_path="non existent file path") def test_cleanup_can_defer_removal_if_file_is_used_recently(self): diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index fca03b5185f3..b6aecf747495 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -8,6 +8,7 @@ import os.path as osp import re import zipfile +import tempfile from collections.abc import Generator, Sequence from contextlib import contextmanager from copy import deepcopy @@ -15,6 +16,7 @@ from enum import Enum from threading import Lock from typing import Any +from pathlib import Path import attrs import django_rq @@ -103,7 +105,7 @@ def faster_deepcopy(v): class LockNotAvailableError(Exception): pass -class CacheFilePathParseError(Exception): +class CacheFileOrDirPathParseError(Exception): pass @@ -176,9 +178,9 @@ def values(cls) -> list[str]: class _ParsedExportFilename: file_type: ExportFileType file_ext: str - instance_type: str + instance_type: InstanceType = attrs.field(converter=InstanceType) instance_id: int - instance_timestamp: float + instance_timestamp: float = attrs.field(converter=float) @attrs.frozen @@ -190,6 +192,80 @@ class ParsedDatasetFilename(_ParsedExportFilename): class ParsedBackupFilename(_ParsedExportFilename): pass +@attrs.frozen +class ParsedTmpDirFilename: + instance_type: InstanceType = attrs.field(converter=InstanceType) + instance_timestamp: float = attrs.field(converter=float) + +_not_set = object() + +class TmpDirManager: + SPLITTER = "-" + INSTANCE_PREFIX = "instance" + TMP_ROOT = settings.TMP_FILES_ROOT + + @classmethod + @contextmanager + def get_tmp_dir( + cls, + *, + prefix: str | object = _not_set, + suffix: str | object = _not_set, + ignore_cleanup_errors: bool | object = _not_set, + ) -> Generator[str, Any, Any]: + params = {} + for k, v in { + "prefix": prefix, + "suffix": suffix, + "ignore_cleanup_errors": ignore_cleanup_errors, + }.items(): + if v is not _not_set: + params[k] = v + + with tempfile.TemporaryDirectory(**params, dir=cls.TMP_ROOT) as tmp_dir: + yield tmp_dir + + @classmethod + @contextmanager + def get_tmp_export_dir( + cls, + *, + instance_type: str, + instance_timestamp: float, + ) -> Generator[str, Any, Any]: + instance_type = InstanceType(instance_type.lower()) + with cls.get_tmp_dir( + prefix=cls.SPLITTER.join( + ["export", instance_type, cls.INSTANCE_PREFIX + str(instance_timestamp)] + ) + cls.SPLITTER + ) as tmp_dir: + yield tmp_dir + + 
@classmethod
+    def parse_tmp_directory(cls, dir_path: os.PathLike[str]) -> ParsedTmpDirFilename:
+        dir_path = Path(osp.normpath(dir_path))
+        assert dir_path.is_dir()
+        dir_name = dir_path.name
+
+        basename_match = re.fullmatch(
+            (
+                rf"^export{cls.SPLITTER}(?P<instance_type>{'|'.join(InstanceType.values())})"
+                rf"{cls.SPLITTER}{cls.INSTANCE_PREFIX}(?P<instance_timestamp>\d+\.\d+){cls.SPLITTER}"
+            ),
+            dir_name,
+        )
+
+        if not basename_match:
+            raise CacheFileOrDirPathParseError(f"Couldn't parse directory name: {dir_name!r}")
+
+        try:
+            parsed_dir_name = ParsedTmpDirFilename(
+                basename_match.groupdict()
+            )
+        except ValueError as ex:
+            raise CacheFileOrDirPathParseError(f"Couldn't parse directory name: {dir_name!r}") from ex
+
+        return parsed_dir_name

 class ExportCacheManager:
     SPLITTER = "-"
@@ -275,7 +351,7 @@ def parse_file_path(
         )

         if not basename_match:
-            raise CacheFilePathParseError(f"Couldn't parse file name: {basename!r}")
+            raise CacheFileOrDirPathParseError(f"Couldn't parse file name: {basename!r}")

         fragments = basename_match.groupdict()
         fragments["instance_id"] = int(fragments["instance_id"])
@@ -287,7 +363,7 @@
         try:
             instance_timestamp, format_repr = unparsed.split(cls.SPLITTER, maxsplit=1)
         except ValueError:
-            raise CacheFilePathParseError(f"Couldn't parse file name: {basename!r}")
+            raise CacheFileOrDirPathParseError(f"Couldn't parse file name: {basename!r}")

         specific_params["format_repr"] = format_repr

         ParsedFileNameClass = ParsedDatasetFilename
@@ -296,16 +372,16 @@
             ParsedFileNameClass = ParsedBackupFilename

         try:
-            instance_timestamp = float(instance_timestamp)
-        except ValueError:
-            raise CacheFilePathParseError(f"Couldn't parse instance timestamp: {instance_timestamp!r}")
-
-        return ParsedFileNameClass(
-            file_ext=file_ext,
-            instance_timestamp=instance_timestamp,
-            **fragments,
-            **specific_params,
-        )
+            parsed_file_name = ParsedFileNameClass(
+                file_ext=file_ext,
+                instance_timestamp=instance_timestamp,
+                **fragments,
+                **specific_params,
+            )
+        except ValueError as ex:
+            raise CacheFileOrDirPathParseError(f"Couldn't parse file name: {basename!r}") from ex
+
+        return parsed_file_name


 def extend_export_file_lifetime(file_path: str):
diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py
index ffaafb5e14af..fda464592044 100644
--- a/cvat/apps/dataset_manager/views.py
+++ b/cvat/apps/dataset_manager/views.py
@@ -27,7 +27,7 @@
 from .util import (
     LockNotAvailableError,
     current_function_name, get_export_cache_lock,
-    ExportCacheManager, extend_export_file_lifetime
+    ExportCacheManager, extend_export_file_lifetime, TmpDirManager
 )


@@ -132,6 +132,7 @@ def export(
         db_instance = Job.objects.get(pk=job_id)

     cache_ttl = get_export_cache_ttl(db_instance)
+    instance_type = db_instance.__class__.__name__

     # As we're not locking the db object here, it can be updated by the time of actual export.
     # The file will be saved with the older timestamp.
@@ -147,7 +148,7 @@ def export( output_path = ExportCacheManager.make_dataset_file_path( instance_id=db_instance.id, - instance_type=db_instance.__class__.__name__, + instance_type=instance_type, instance_timestamp=instance_update_time.timestamp(), save_images=save_images, format_name=dst_format @@ -165,12 +166,13 @@ def export( extend_export_file_lifetime(output_path) return output_path - tmp_dir = db_instance.get_tmp_dirname() - - with tempfile.TemporaryDirectory(dir=tmp_dir) as temp_dir: + with TmpDirManager.get_tmp_export_dir( + instance_type=instance_type, + instance_timestamp=instance_update_time.timestamp(), + ) as temp_dir: temp_file = osp.join(temp_dir, 'result') - export_fn(db_instance.id, temp_file, dst_format, - server_url=server_url, save_images=save_images) + export_fn(db_instance.id, temp_file, dst_format=dst_format, + server_url=server_url, save_images=save_images, temp_dir=temp_dir) with get_export_cache_lock( output_path, ttl=EXPORT_CACHE_LOCK_TTL, diff --git a/cvat/apps/engine/backup.py b/cvat/apps/engine/backup.py index 339a396f59ab..6a02e88ba1c8 100644 --- a/cvat/apps/engine/backup.py +++ b/cvat/apps/engine/backup.py @@ -57,7 +57,7 @@ from cvat.apps.dataset_manager.views import log_exception from cvat.apps.dataset_manager.bindings import CvatImportError from cvat.apps.dataset_manager.views import EXPORT_CACHE_LOCK_TTL, EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT -from cvat.apps.dataset_manager.util import extend_export_file_lifetime +from cvat.apps.dataset_manager.util import extend_export_file_lifetime, TmpDirManager slogger = ServerLogManager(__name__) @@ -1035,13 +1035,13 @@ def create_backup( cache_ttl: timedelta, ): try: - tmp_dir = db_instance.get_tmp_dirname() + instance_type = db_instance.__class__.__name__ db_instance.refresh_from_db(fields=['updated_date']) instance_timestamp = timezone.localtime(db_instance.updated_date).timestamp() output_path = ExportCacheManager.make_backup_file_path( instance_id=db_instance.id, - instance_type=db_instance.__class__.__name__, + instance_type=instance_type, instance_timestamp=instance_timestamp ) @@ -1056,8 +1056,11 @@ def create_backup( extend_export_file_lifetime(output_path) return output_path - with tempfile.TemporaryDirectory(dir=tmp_dir) as temp_dir: - temp_file = os.path.join(temp_dir, 'dump') + # TODO: use another prefix? + with TmpDirManager.get_tmp_export_dir( + instance_type=instance_type, instance_timestamp=instance_timestamp + ) as tmp_dir: + temp_file = os.path.join(tmp_dir, 'dump') exporter = Exporter(db_instance.id) exporter.export_to(temp_file) @@ -1175,7 +1178,8 @@ def _import(importer, request, queue, rq_id, Serializer, file_field_name, locati return Response(serializer.data, status=status.HTTP_202_ACCEPTED) def get_backup_dirname(): - return settings.TMP_FILES_ROOT + # FUTURE-FIXME + return TmpDirManager.TMP_ROOT def import_project(request, queue_name, filename=None): if 'rq_id' in request.data: diff --git a/cvat/apps/engine/models.py b/cvat/apps/engine/models.py index 37d9c143b298..6a8c573ce34e 100644 --- a/cvat/apps/engine/models.py +++ b/cvat/apps/engine/models.py @@ -443,11 +443,13 @@ class Meta: def get_dirname(self) -> str: ... 
- def get_tmp_dirname(self, *, create: bool = True) -> str: + def get_tmp_dirname(self) -> str: + """ + This method returns a directory that is only used + to store temporary files or subfolders related to an object + """ dir_path = os.path.join(self.get_dirname(), "tmp") - - if create: - os.makedirs(dir_path, exist_ok=True) + os.makedirs(dir_path, exist_ok=True) return dir_path diff --git a/cvat/apps/engine/views.py b/cvat/apps/engine/views.py index eb39f6732c18..6fc701e6631f 100644 --- a/cvat/apps/engine/views.py +++ b/cvat/apps/engine/views.py @@ -463,7 +463,7 @@ def get_upload_dir(self): return self._object.get_tmp_dirname() elif 'backup' in self.action: return backup.get_backup_dirname() - return "" + assert False def upload_finished(self, request): if self.action == 'dataset': @@ -471,9 +471,10 @@ def upload_finished(self, request): filename = request.query_params.get("filename", "") conv_mask_to_poly = to_bool(request.query_params.get('conv_mask_to_poly', True)) tmp_dir = self._object.get_tmp_dirname() - uploaded_file = None - if os.path.isfile(os.path.join(tmp_dir, filename)): - uploaded_file = os.path.join(tmp_dir, filename) + uploaded_file = os.path.join(tmp_dir, filename) + if not os.path.isfile(uploaded_file): + uploaded_file = None # TODO: why is this needed + return _import_project_dataset( request=request, filename=uploaded_file, @@ -1064,7 +1065,8 @@ def get_upload_dir(self): return self._object.data.get_upload_dirname() elif 'backup' in self.action: return backup.get_backup_dirname() - return "" + + assert False def _prepare_upload_info_entry(self, filename: str) -> str: filename = osp.normpath(filename) @@ -1142,8 +1144,8 @@ def _handle_upload_annotations(request): filename = request.query_params.get("filename", "") conv_mask_to_poly = to_bool(request.query_params.get('conv_mask_to_poly', True)) tmp_dir = self._object.get_tmp_dirname() - if os.path.isfile(os.path.join(tmp_dir, filename)): - annotation_file = os.path.join(tmp_dir, filename) + annotation_file = os.path.join(tmp_dir, filename) + if os.path.isfile(annotation_file): return _import_annotations( request=request, filename=annotation_file, @@ -1951,8 +1953,8 @@ def upload_finished(self, request): filename = request.query_params.get("filename", "") conv_mask_to_poly = to_bool(request.query_params.get('conv_mask_to_poly', True)) tmp_dir = self.get_upload_dir() - if os.path.isfile(os.path.join(tmp_dir, filename)): - annotation_file = os.path.join(tmp_dir, filename) + annotation_file = os.path.join(tmp_dir, filename) + if os.path.isfile(annotation_file): return _import_annotations( request=request, filename=annotation_file, @@ -2091,7 +2093,7 @@ def upload_finished(self, request): serializer_class=LabeledDataSerializer, parser_classes=_UPLOAD_PARSER_CLASSES, csrf_workaround_is_needed=csrf_workaround_is_needed_for_export) def annotations(self, request, pk): - self._object = self.get_object() # force call of check_object_permissions() + self._object: models.Job = self.get_object() # force call of check_object_permissions() if request.method == 'GET': # FUTURE-TODO: mark as deprecated using this endpoint to export annotations when new API for result file downloading will be implemented return self.export_dataset_v1( From 820f63d340baf557e42286a27716d47ed4490e15 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Tue, 7 Jan 2025 13:52:52 +0100 Subject: [PATCH 41/61] Sort imports --- cvat/apps/dataset_manager/project.py | 7 ++++--- cvat/apps/dataset_manager/task.py | 9 ++++----- 
cvat/apps/dataset_manager/tests/test_rest_api_formats.py | 2 +- cvat/apps/dataset_manager/util.py | 4 ++-- cvat/apps/dataset_manager/views.py | 6 ++---- cvat/apps/engine/backup.py | 7 +++---- cvat/apps/engine/models.py | 2 +- cvat/apps/engine/tests/test_rest_api.py | 4 ++-- cvat/apps/engine/tests/utils.py | 5 +++-- 9 files changed, 22 insertions(+), 24 deletions(-) diff --git a/cvat/apps/dataset_manager/project.py b/cvat/apps/dataset_manager/project.py index 3c1821af8eec..afdcd9302499 100644 --- a/cvat/apps/dataset_manager/project.py +++ b/cvat/apps/dataset_manager/project.py @@ -3,15 +3,16 @@ # # SPDX-License-Identifier: MIT -from contextlib import nullcontext +import io from collections.abc import Mapping +from contextlib import nullcontext from typing import Any, Callable -import io + import rq from datumaro.components.errors import DatasetError, DatasetImportError, DatasetNotFoundError from django.conf import settings -from django.utils import timezone from django.db import transaction +from django.utils import timezone from cvat.apps.dataset_manager.task import TaskAnnotation from cvat.apps.dataset_manager.util import TmpDirManager diff --git a/cvat/apps/dataset_manager/task.py b/cvat/apps/dataset_manager/task.py index b9aa4adfb4ca..b11c85f771c5 100644 --- a/cvat/apps/dataset_manager/task.py +++ b/cvat/apps/dataset_manager/task.py @@ -6,17 +6,16 @@ import io import itertools from collections import OrderedDict +from contextlib import nullcontext from copy import deepcopy from enum import Enum -from typing import Optional, Union, Callable - -from contextlib import nullcontext -from django.utils import timezone +from typing import Callable, Optional, Union from datumaro.components.errors import DatasetError, DatasetImportError, DatasetNotFoundError from django.conf import settings from django.db import transaction from django.db.models.query import Prefetch, QuerySet +from django.utils import timezone from rest_framework.exceptions import ValidationError from cvat.apps.dataset_manager.annotation import AnnotationIR, AnnotationManager @@ -28,11 +27,11 @@ ) from cvat.apps.dataset_manager.formats.registry import make_exporter, make_importer from cvat.apps.dataset_manager.util import ( + TmpDirManager, add_prefetch_fields, bulk_create, faster_deepcopy, get_cached, - TmpDirManager, ) from cvat.apps.engine import models, serializers from cvat.apps.engine.log import DatasetLogManager diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index 639f61471474..5eec5ed9bdac 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -32,8 +32,8 @@ from rest_framework import status import cvat.apps.dataset_manager as dm -from cvat.apps.dataset_manager.cron import clear_export_cache from cvat.apps.dataset_manager.bindings import CvatTaskOrJobDataExtractor, TaskData +from cvat.apps.dataset_manager.cron import clear_export_cache from cvat.apps.dataset_manager.task import TaskAnnotation from cvat.apps.dataset_manager.tests.utils import TestDir from cvat.apps.dataset_manager.util import get_export_cache_lock diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index b6aecf747495..d53f3ff2c5ca 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -7,16 +7,16 @@ import os import os.path as osp import re -import zipfile import tempfile +import zipfile from collections.abc import Generator, 
Sequence from contextlib import contextmanager from copy import deepcopy from datetime import timedelta from enum import Enum +from pathlib import Path from threading import Lock from typing import Any -from pathlib import Path import attrs import django_rq diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index 553e161a54f1..23249b517d99 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -24,17 +24,15 @@ from cvat.apps.engine.utils import get_rq_lock_by_user from .formats.registry import EXPORT_FORMATS, IMPORT_FORMATS -from .util import EXPORT_CACHE_DIR_NAME # pylint: disable=unused-import from .util import ( + ExportCacheManager, LockNotAvailableError, + TmpDirManager, current_function_name, extend_export_file_lifetime, get_export_cache_lock, - ExportCacheManager, - TmpDirManager, ) - slogger = ServerLogManager(__name__) _MODULE_NAME = __package__ + '.' + osp.splitext(osp.basename(__file__))[0] diff --git a/cvat/apps/engine/backup.py b/cvat/apps/engine/backup.py index 36b90c76dc5a..fef27f2a1657 100644 --- a/cvat/apps/engine/backup.py +++ b/cvat/apps/engine/backup.py @@ -8,16 +8,15 @@ import os import re import shutil -import tempfile import uuid from abc import ABCMeta, abstractmethod from collections.abc import Collection, Iterable +from datetime import timedelta from enum import Enum from logging import Logger from tempfile import NamedTemporaryFile -from typing import Any, Optional, Union, Type +from typing import Any, Optional, Type, Union from zipfile import ZipFile -from datetime import timedelta import django_rq from django.conf import settings @@ -31,8 +30,8 @@ import cvat.apps.dataset_manager as dm from cvat.apps.dataset_manager.bindings import CvatImportError -from cvat.apps.dataset_manager.views import log_exception from cvat.apps.dataset_manager.util import ExportCacheManager, TmpDirManager +from cvat.apps.dataset_manager.views import log_exception from cvat.apps.engine import models from cvat.apps.engine.cloud_provider import import_resource_from_cloud_storage from cvat.apps.engine.location import StorageType, get_location_configuration diff --git a/cvat/apps/engine/models.py b/cvat/apps/engine/models.py index 6a8c573ce34e..5099670cf739 100644 --- a/cvat/apps/engine/models.py +++ b/cvat/apps/engine/models.py @@ -22,8 +22,8 @@ from django.core.files.storage import FileSystemStorage from django.db import IntegrityError, models, transaction from django.db.models import Q, TextChoices -from django.db.models.fields import FloatField from django.db.models.base import ModelBase +from django.db.models.fields import FloatField from django.utils.translation import gettext_lazy as _ from drf_spectacular.types import OpenApiTypes from drf_spectacular.utils import extend_schema_field diff --git a/cvat/apps/engine/tests/test_rest_api.py b/cvat/apps/engine/tests/test_rest_api.py index c74959f22dd4..cb613c60d048 100644 --- a/cvat/apps/engine/tests/test_rest_api.py +++ b/cvat/apps/engine/tests/test_rest_api.py @@ -4,7 +4,6 @@ # SPDX-License-Identifier: MIT -import django_rq import copy import io import json @@ -27,6 +26,7 @@ from unittest import mock import av +import django_rq import numpy as np from django.conf import settings from django.contrib.auth.models import Group, User @@ -3108,7 +3108,7 @@ def test_api_v2_tasks_id_export_no_auth(self): self._run_api_v2_tasks_id_export_import(None) def test_can_remove_export_cache_automatically_after_successful_export(self): - from cvat.apps.dataset_manager.cron import 
cron_export_cache_cleanup, clear_export_cache + from cvat.apps.dataset_manager.cron import clear_export_cache, cron_export_cache_cleanup self._create_tasks() task_id = self.tasks[0]["id"] user = self.admin diff --git a/cvat/apps/engine/tests/utils.py b/cvat/apps/engine/tests/utils.py index d0189d96592c..15d2adb65451 100644 --- a/cvat/apps/engine/tests/utils.py +++ b/cvat/apps/engine/tests/utils.py @@ -5,11 +5,13 @@ import itertools import logging import os +import shutil from collections.abc import Iterator, Sequence from contextlib import contextmanager from io import BytesIO +from pathlib import Path from typing import Any, Callable, TypeVar -import shutil + import av import django_rq import numpy as np @@ -17,7 +19,6 @@ from django.core.cache import caches from django.http.response import HttpResponse from PIL import Image -from pathlib import Path from rest_framework.test import APITestCase T = TypeVar('T') From eaa302492341801a3a377eb9211ef1d9bf1a1dd5 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Tue, 7 Jan 2025 13:57:07 +0100 Subject: [PATCH 42/61] Fix merging issues --- cvat/apps/dataset_manager/views.py | 1 - cvat/apps/engine/backup.py | 16 ++++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index 23249b517d99..3fad15301f95 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -6,7 +6,6 @@ import logging import os import os.path as osp -import tempfile from datetime import timedelta from os.path import exists as osp_exists diff --git a/cvat/apps/engine/backup.py b/cvat/apps/engine/backup.py index fef27f2a1657..1dda056d9397 100644 --- a/cvat/apps/engine/backup.py +++ b/cvat/apps/engine/backup.py @@ -30,8 +30,20 @@ import cvat.apps.dataset_manager as dm from cvat.apps.dataset_manager.bindings import CvatImportError -from cvat.apps.dataset_manager.util import ExportCacheManager, TmpDirManager -from cvat.apps.dataset_manager.views import log_exception +from cvat.apps.dataset_manager.util import ( + ExportCacheManager, + TmpDirManager, + extend_export_file_lifetime, + get_export_cache_lock, +) +from cvat.apps.dataset_manager.views import ( + EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT, + EXPORT_CACHE_LOCK_TTL, + EXPORT_LOCKED_RETRY_INTERVAL, + LockNotAvailableError, + log_exception, + retry_current_rq_job, +) from cvat.apps.engine import models from cvat.apps.engine.cloud_provider import import_resource_from_cloud_storage from cvat.apps.engine.location import StorageType, get_location_configuration From 0f88933f1350b260ce5e2cbde766dd9fd3954ad5 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Wed, 8 Jan 2025 13:27:14 +0100 Subject: [PATCH 43/61] Cron job to clean up tmp files/dirs && small fixes --- cvat/apps/dataset_manager/cron.py | 120 +++++++++++++++++++++------ cvat/apps/dataset_manager/project.py | 1 - cvat/apps/dataset_manager/task.py | 4 +- cvat/apps/dataset_manager/util.py | 88 +++++++++++++------- cvat/apps/dataset_manager/views.py | 7 +- cvat/apps/engine/backup.py | 5 +- cvat/apps/engine/models.py | 9 +- cvat/settings/base.py | 13 ++- 8 files changed, 170 insertions(+), 77 deletions(-) diff --git a/cvat/apps/dataset_manager/cron.py b/cvat/apps/dataset_manager/cron.py index c6e6eeb9c9af..11327f5f9338 100644 --- a/cvat/apps/dataset_manager/cron.py +++ b/cvat/apps/dataset_manager/cron.py @@ -6,17 +6,28 @@ import os import os.path as osp +import shutil +from abc import ABCMeta, abstractmethod from datetime import timedelta from 
functools import wraps +from pathlib import Path from threading import Event, Thread from time import sleep -from typing import Callable +from typing import Callable, ClassVar from django.conf import settings from django.utils import timezone +from django.utils.module_loading import import_string from rq import get_current_job -from cvat.apps.dataset_manager.util import ExportCacheManager, get_export_cache_lock +from cvat.apps.dataset_manager.util import ( + CacheFileOrDirPathParseError, + ExportCacheManager, + OperationType, + TmpDirManager, + TmpEntityType, + get_export_cache_lock, +) from cvat.apps.dataset_manager.views import ( EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT, EXPORT_CACHE_LOCK_TTL, @@ -39,7 +50,7 @@ def wrapper(self: CleanupExportCacheThread): return wrapper -def clear_export_cache(file_path: str) -> None: +def clear_export_cache(file_path: Path) -> bool: with get_export_cache_lock( file_path, block=True, @@ -50,30 +61,86 @@ def clear_export_cache(file_path: str) -> None: cache_ttl = get_export_cache_ttl(parsed_filename.instance_type) if timezone.now().timestamp() <= osp.getmtime(file_path) + cache_ttl.total_seconds(): - logger.debug(f"Export cache file {file_path!r} was recently accessed") - return + logger.debug(f"Export cache file {file_path.name!r} was recently accessed") + return False os.remove(file_path) - logger.debug(f"Export cache file {file_path!r} was successfully removed") + logger.debug(f"Export cache file {file_path.name!r} was successfully removed") + return True + + +def remove_tmp_dir(dir_path: str) -> bool: + # we do not use locks here when handling a temporary directories + # because undesired race conditions are not possible here: + # 1. A temporary directory can be removed while parsing its name or checking the last modification date. + # In that case an exception is expected and will be handled by the cron job. + # 2. A temporary directory can be removed by a worker only when it is outdated. + # 3. Each temporary directory has a unique name, so the race condition when one process is creating a directory + # and another is removing it - impossible. + parsed = TmpDirManager.parse_tmp_child(dir_path) + assert parsed.operation == OperationType.EXPORT + assert parsed.type == TmpEntityType.DIR + cache_ttl = get_export_cache_ttl(parsed.instance_type) + + if timezone.now().timestamp() <= osp.getmtime(dir_path) + cache_ttl.total_seconds(): + return False + shutil.rmtree(dir_path) + logger.debug(f"Temporary directory {dir_path} was successfully removed") + return True + + +class BaseCleanupThread(Thread, metaclass=ABCMeta): + description: ClassVar[str] -class CleanupExportCacheThread(Thread): def __init__(self, stop_event: Event, *args, **kwargs) -> None: self._stop_event = stop_event - self._removed_files_count = 0 + self._removed_entities = 0 self._exception = None - super().__init__(*args, **kwargs, target=self._cleanup_export_cache) + super().__init__(*args, **kwargs, target=self._cleanup) @property - def removed_files_count(self) -> int: - return self._removed_files_count + def removed_entities(self) -> int: + return self._removed_entities + + @abstractmethod + def _cleanup(self) -> None: ... 
def set_exception(self, ex: Exception) -> None: assert isinstance(ex, Exception) self._exception = ex + def raise_if_exception(self) -> None: + if isinstance(self._exception, Exception): + raise self._exception + + +class CleanupTmpDirThread(BaseCleanupThread): + description: ClassVar[str] = "Cleanup common temporary directory" + @suppress_exceptions - def _cleanup_export_cache(self) -> None: + def _cleanup(self) -> None: + for dir_path in TmpDirManager.get_export_related_dirs(): + # stop clean up process correctly before rq job timeout is ended + if self._stop_event.is_set(): + return + + try: + if remove_tmp_dir(dir_path): + self._removed_entities += 1 + except CacheFileOrDirPathParseError: + logger.warning(f"Cannot parse {dir_path.name}, skipping...") + continue + + except Exception: + log_exception(logger) + + +class CleanupExportCacheThread(BaseCleanupThread): + description: ClassVar[str] = "Cleanup export cache" + + @suppress_exceptions + def _cleanup(self) -> None: export_cache_dir_path = settings.EXPORT_CACHE_ROOT assert os.path.exists(export_cache_dir_path) @@ -88,17 +155,20 @@ def _cleanup_export_cache(self) -> None: continue try: - clear_export_cache(child) - self._removed_files_count += 1 + if clear_export_cache(child): + self._removed_entities += 1 + except CacheFileOrDirPathParseError: + logger.warning(f"Cannot parse {child.name}, skipping...") + continue + except Exception: log_exception(logger) - def raise_if_exception(self) -> None: - if isinstance(self._exception, Exception): - raise self._exception +def cleanup(thread_class_path: str) -> None: + ThreadClass = import_string(thread_class_path) + assert issubclass(ThreadClass, BaseCleanupThread) -def cron_export_cache_cleanup() -> None: started_at = timezone.now() rq_job = get_current_job() seconds_left = rq_job.timeout - 60 @@ -107,11 +177,11 @@ def cron_export_cache_cleanup() -> None: finish_before = started_at + timedelta(seconds=seconds_left) stop_event = Event() - cleanup_export_cache_thread = CleanupExportCacheThread(stop_event=stop_event) - cleanup_export_cache_thread.start() + cleanup_thread = ThreadClass(stop_event=stop_event) + cleanup_thread.start() while timezone.now() < finish_before: - if not cleanup_export_cache_thread.is_alive(): + if not cleanup_thread.is_alive(): stop_event.set() break sleep(sleep_interval) @@ -119,12 +189,12 @@ def cron_export_cache_cleanup() -> None: if not stop_event.is_set(): stop_event.set() - cleanup_export_cache_thread.join() - cleanup_export_cache_thread.raise_if_exception() + cleanup_thread.join() + cleanup_thread.raise_if_exception() finished_at = timezone.now() logger.info( - f"Export cache cleanup has been successfully " + f"The {cleanup_thread.description!r} process has been successfully " f"completed after {int((finished_at - started_at).total_seconds())} seconds. 
" - f"{cleanup_export_cache_thread.removed_files_count} files have been removed" + f"{cleanup_thread.removed_entities} elements have been removed" ) diff --git a/cvat/apps/dataset_manager/project.py b/cvat/apps/dataset_manager/project.py index afdcd9302499..c72ab380a58a 100644 --- a/cvat/apps/dataset_manager/project.py +++ b/cvat/apps/dataset_manager/project.py @@ -158,7 +158,6 @@ def export( with ( TmpDirManager.get_tmp_export_dir( instance_type=self.db_project.__class__.__name__, - instance_timestamp=timezone.localtime(self.db_project.updated_date).timestamp(), ) if not temp_dir else nullcontext(temp_dir) ) as temp_dir: exporter(dst_file, temp_dir, project_data, **options) diff --git a/cvat/apps/dataset_manager/task.py b/cvat/apps/dataset_manager/task.py index b11c85f771c5..9c614a4c5003 100644 --- a/cvat/apps/dataset_manager/task.py +++ b/cvat/apps/dataset_manager/task.py @@ -788,7 +788,6 @@ def export( with ( TmpDirManager.get_tmp_export_dir( instance_type=self.db_job.__class__.__name__, - instance_timestamp=timezone.localtime(self.db_job.updated_date).timestamp(), ) if not temp_dir else nullcontext(temp_dir) ) as temp_dir: exporter(dst_file, temp_dir, job_data, **options) @@ -1004,7 +1003,6 @@ def export( with ( TmpDirManager.get_tmp_export_dir( instance_type=self.db_task.__class__.__name__, - instance_timestamp=timezone.localtime(self.db_task.updated_date).timestamp(), ) if not temp_dir else nullcontext(temp_dir) ) as temp_dir: exporter(dst_file, temp_dir, task_data, **options) @@ -1143,8 +1141,8 @@ def delete_task_data(pk): def export_task( task_id: int, - *, dst_file: str, + *, format_name: str, server_url: str | None = None, save_images: bool = False, diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index d53f3ff2c5ca..5e04e6a70a0a 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -155,7 +155,15 @@ def get_export_cache_lock( if acquired: lock.release() +class OperationType(str, Enum): + EXPORT = "export" + IMPORT = "import" + @classmethod + def values(cls) -> list[str]: + return list(map(lambda x: x.value, cls)) + +# todo: rename class ExportFileType(str, Enum): ANNOTATIONS = "annotations" BACKUP = "backup" @@ -192,26 +200,42 @@ class ParsedDatasetFilename(_ParsedExportFilename): class ParsedBackupFilename(_ParsedExportFilename): pass +class TmpEntityType(str, Enum): + DIR = "dir" + FILE = "file" + @attrs.frozen -class ParsedTmpDirFilename: +class ParsedTmpEntity: instance_type: InstanceType = attrs.field(converter=InstanceType) - instance_timestamp: float = attrs.field(converter=float) + operation: OperationType = attrs.field(converter=OperationType) + +@attrs.frozen +class ParsedTmpDir(ParsedTmpEntity): + type: TmpEntityType = attrs.field(init=False, default=TmpEntityType.DIR) + +@attrs.frozen +class ParsedTmpFile(ParsedTmpEntity): + type: TmpEntityType = attrs.field(init=False, default=TmpEntityType.FILE) -_not_set = object() class TmpDirManager: SPLITTER = "-" - INSTANCE_PREFIX = "instance" TMP_ROOT = settings.TMP_FILES_ROOT + @classmethod + def get_export_related_dirs(cls) -> Generator[Path, Any, Any]: + for item in Path(cls.TMP_ROOT).glob(f"{OperationType.EXPORT}*"): + if item.is_dir(): + yield item + @classmethod @contextmanager def get_tmp_dir( cls, *, - prefix: str | object = _not_set, - suffix: str | object = _not_set, - ignore_cleanup_errors: bool | object = _not_set, + prefix: str | None = None, + suffix: str | None = None, + ignore_cleanup_errors: bool | None = None, ) -> Generator[str, Any, 
Any]:
         params = {}
         for k, v in {
@@ -219,7 +243,7 @@ def get_tmp_dir(
             "suffix": suffix,
             "ignore_cleanup_errors": ignore_cleanup_errors,
         }.items():
-            if v is not _not_set:
+            if v is not None:
                 params[k] = v

         with tempfile.TemporaryDirectory(**params, dir=cls.TMP_ROOT) as tmp_dir:
@@ -231,41 +255,41 @@ def get_tmp_dir(
         cls,
         *,
         instance_type: str,
-        instance_timestamp: float,
     ) -> Generator[str, Any, Any]:
         instance_type = InstanceType(instance_type.lower())
         with cls.get_tmp_dir(
-            prefix=cls.SPLITTER.join(
-                ["export", instance_type, cls.INSTANCE_PREFIX + str(instance_timestamp)]
-            ) + cls.SPLITTER
+            prefix=cls.SPLITTER.join([OperationType.EXPORT, instance_type]) + cls.SPLITTER
         ) as tmp_dir:
             yield tmp_dir

     @classmethod
-    def parse_tmp_directory(cls, dir_path: os.PathLike[str]) -> ParsedTmpDirFilename:
-        dir_path = Path(osp.normpath(dir_path))
-        assert dir_path.is_dir()
-        dir_name = dir_path.name
+    def parse_tmp_child(cls, child_path: os.PathLike[str]) -> ParsedTmpDir | ParsedTmpFile:
+        child_path = Path(osp.normpath(child_path))
+
+        if child_path.is_dir():
+            dir_name = child_path.name
+
+            basename_match = re.match(
+                (
+                    rf"^(?P<operation>{'|'.join(OperationType.values())}){cls.SPLITTER}"
+                    rf"(?P<instance_type>{'|'.join(InstanceType.values())}){cls.SPLITTER}"
+                ),
+                dir_name,
+            )

-        basename_match = re.fullmatch(
-            (
-                rf"^export{cls.SPLITTER}(?P<instance_type>{'|'.join(InstanceType.values())})"
-                rf"{cls.SPLITTER}{cls.INSTANCE_PREFIX}(?P<instance_timestamp>\d+\.\d+){cls.SPLITTER}"
-            ),
-            dir_name,
-        )
+            if not basename_match:
+                raise CacheFileOrDirPathParseError(f"Couldn't parse directory name: {dir_name!r}")

-        if not basename_match:
-            raise CacheFileOrDirPathParseError(f"Couldn't parse directory name: {dir_name!r}")
+            try:
+                parsed_dir_name = ParsedTmpDir(
+                    **basename_match.groupdict()
+                )
+            except ValueError as ex:
+                raise CacheFileOrDirPathParseError(f"Couldn't parse directory name: {dir_name!r}") from ex

-        try:
-            parsed_dir_name = ParsedTmpDirFilename(
-                basename_match.groupdict()
-            )
-        except ValueError as ex:
-            raise CacheFileOrDirPathParseError(f"Couldn't parse directory name: {dir_name!r}") from ex
+            return parsed_dir_name

-        return parsed_dir_name
+        raise NotImplementedError()

 class ExportCacheManager:
     SPLITTER = "-"
diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py
index 3fad15301f95..20c14a73eb5f 100644
--- a/cvat/apps/dataset_manager/views.py
+++ b/cvat/apps/dataset_manager/views.py
@@ -167,12 +167,9 @@ def export(
         extend_export_file_lifetime(output_path)
         return output_path

-    with TmpDirManager.get_tmp_export_dir(
-        instance_type=instance_type,
-        instance_timestamp=instance_update_time.timestamp(),
-    ) as temp_dir:
+    with TmpDirManager.get_tmp_export_dir(instance_type=instance_type) as temp_dir:
         temp_file = osp.join(temp_dir, 'result')
-        export_fn(db_instance.id, temp_file, dst_format=dst_format,
+        export_fn(db_instance.id, temp_file, format_name=dst_format,
             server_url=server_url, save_images=save_images, temp_dir=temp_dir)
     with get_export_cache_lock(
         output_path,
diff --git a/cvat/apps/engine/backup.py b/cvat/apps/engine/backup.py
index 1dda056d9397..c32c136f248e 100644
--- a/cvat/apps/engine/backup.py
+++ b/cvat/apps/engine/backup.py
@@ -1083,10 +1083,7 @@ def create_backup(
         extend_export_file_lifetime(output_path)
         return output_path

-        # TODO: use another prefix?
- with TmpDirManager.get_tmp_export_dir( - instance_type=instance_type, instance_timestamp=instance_timestamp - ) as tmp_dir: + with TmpDirManager.get_tmp_export_dir(instance_type=instance_type) as tmp_dir: temp_file = os.path.join(tmp_dir, 'dump') exporter = Exporter(db_instance.id) exporter.export_to(temp_file) diff --git a/cvat/apps/engine/models.py b/cvat/apps/engine/models.py index 5099670cf739..e012c75bef1c 100644 --- a/cvat/apps/engine/models.py +++ b/cvat/apps/engine/models.py @@ -435,18 +435,15 @@ def touch(self) -> None: class ABCModelMeta(ABCMeta, ModelBase): pass -class FileSystemRelatedModel(models.Model, metaclass=ABCModelMeta): - class Meta: - abstract = True - +class FileSystemRelatedModel(metaclass=ABCModelMeta): @abstractmethod def get_dirname(self) -> str: ... def get_tmp_dirname(self) -> str: """ - This method returns a directory that is only used - to store temporary files or subfolders related to an object + The method returns a directory that is only used + to store temporary files or folders related to the object """ dir_path = os.path.join(self.get_dirname(), "tmp") os.makedirs(dir_path, exist_ok=True) diff --git a/cvat/settings/base.py b/cvat/settings/base.py index 2cbd180e4844..c28da9b9f7aa 100644 --- a/cvat/settings/base.py +++ b/cvat/settings/base.py @@ -356,9 +356,20 @@ class CVAT_QUEUES(Enum): { 'queue': CVAT_QUEUES.CLEANING.value, 'id': 'cron_export_cache_cleanup', - 'func': 'cvat.apps.dataset_manager.cron.cron_export_cache_cleanup', + 'func': 'cvat.apps.dataset_manager.cron.cleanup', # Run twice a day (at midnight and at noon) 'cron_string': '0 0,12 * * *', + # 'cron_string': '50 11 * * *', + 'args': ('cvat.apps.dataset_manager.cron.CleanupExportCacheThread',), + }, + { + 'queue': CVAT_QUEUES.CLEANING.value, + 'id': 'cron_tmp_directory_cleanup', + 'func': 'cvat.apps.dataset_manager.cron.cleanup', + # Run once a day + 'cron_string': '0 18 * * *', + # 'cron_string': '17 12 * * *', + 'args': ('cvat.apps.dataset_manager.cron.CleanupTmpDirThread',), } ] From 7a1db7b04fd4536dba03c9907a806eca9ea89be1 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Wed, 8 Jan 2025 17:25:13 +0100 Subject: [PATCH 44/61] Add exportcachecleanup command --- .../dataset_manager/management/__init__.py | 4 ++ .../management/commands/__init__.py | 4 ++ .../management/commands/exportcachecleanup.py | 46 +++++++++++++++++++ .../administration/advanced/upgrade_guide.md | 42 +---------------- 4 files changed, 55 insertions(+), 41 deletions(-) create mode 100644 cvat/apps/dataset_manager/management/__init__.py create mode 100644 cvat/apps/dataset_manager/management/commands/__init__.py create mode 100644 cvat/apps/dataset_manager/management/commands/exportcachecleanup.py diff --git a/cvat/apps/dataset_manager/management/__init__.py b/cvat/apps/dataset_manager/management/__init__.py new file mode 100644 index 000000000000..71cfbec65515 --- /dev/null +++ b/cvat/apps/dataset_manager/management/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2025 CVAT.ai Corporation +# +# SPDX-License-Identifier: MIT + diff --git a/cvat/apps/dataset_manager/management/commands/__init__.py b/cvat/apps/dataset_manager/management/commands/__init__.py new file mode 100644 index 000000000000..71cfbec65515 --- /dev/null +++ b/cvat/apps/dataset_manager/management/commands/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2025 CVAT.ai Corporation +# +# SPDX-License-Identifier: MIT + diff --git a/cvat/apps/dataset_manager/management/commands/exportcachecleanup.py 
b/cvat/apps/dataset_manager/management/commands/exportcachecleanup.py new file mode 100644 index 000000000000..0ee98ec7c3be --- /dev/null +++ b/cvat/apps/dataset_manager/management/commands/exportcachecleanup.py @@ -0,0 +1,46 @@ +# Copyright (C) 2025 CVAT.ai Corporation +# +# SPDX-License-Identifier: MIT + +import os +import shutil +from pathlib import Path + +from django.core.management.base import BaseCommand +from django.utils import timezone + +from cvat.apps.engine.models import Job, Project, Task + + +class Command(BaseCommand): + help = "Cleanup outdated export cache" + + def handle(self, *args, **options): + def update_progress(): + progress = (i + 1) / objects_count + done = int(progress_bar_len * progress) + progress_bar = "#" * done + "-" * (progress_bar_len - done) + print(f"\rProgress: |{progress_bar}| {progress:.0%}", end="", flush=True) + + now = timezone.now() + progress_bar_len = os.get_terminal_size().columns // 2 + + for Model in (Project, Task, Job): + print(f"\nDeleting the export cache for {Model.__name__.lower()}s...") + queryset = Model.objects.filter(created_date__lt=now) + objects_count = queryset.count() + if objects_count < 1: + continue + + msg = ( + f"\nThe {objects_count} folders are going to be checked" + if objects_count > 1 + else "\nThe 1 folder is going to be checked" + ) + print(msg) + + for i, obj in enumerate(queryset.iterator()): + update_progress() + export_cache_dir = Path(obj.get_dirname()) / "export_cache" + if export_cache_dir.exists(): + shutil.rmtree(export_cache_dir) diff --git a/site/content/en/docs/administration/advanced/upgrade_guide.md b/site/content/en/docs/administration/advanced/upgrade_guide.md index 4d23a8a6fa7e..cf1bb43df8aa 100644 --- a/site/content/en/docs/administration/advanced/upgrade_guide.md +++ b/site/content/en/docs/administration/advanced/upgrade_guide.md @@ -59,48 +59,8 @@ To upgrade CVAT, follow these steps: ## Upgrade CVAT after v2.25.0 In version 2.25.0, CVAT changed the location where the export cache is stored. -The following Python script can be used to remove outdated files from the previous location: +To clean up the outdated cache, run the following command: `python manage.py exportcachecleanup`. -```python -import shutil -from pathlib import Path -from django.utils import timezone -from tqdm import tqdm -from cvat.apps.engine.models import Job, Project, Task - - -def cleanup_outdated_cache(): - now = timezone.now() - - for Model in (Project, Task, Job): - print(f"Deleting the export cache for {Model.__name__.lower()}s...") - queryset = Model.objects.filter(created_date__lt=now) - objects_count = queryset.count() - if objects_count < 1: - continue - - print(f"The {objects_count} folder{'s' if objects_count > 1 else ''} are going to be checked") - - for obj in tqdm(queryset.iterator(), total=objects_count): - export_cache_dir = Path(obj.get_dirname()) / "export_cache" - if export_cache_dir.exists(): - shutil.rmtree(export_cache_dir) - - -if __name__ == "__main__": - cleanup_outdated_cache() - -``` - -### How to run the script - -1. Save the script as `cleanup_script.py` in a directory where `manage.py` is located -1. Run Django shell command: `python manage.py shell` -1. Import and execute the script: - ```python - from cleanup_script import cleanup_outdated_cache - cleanup_outdated_cache() - ``` ## How to upgrade CVAT from v2.2.0 to v2.3.0. 
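The `exportcachecleanup` command above walks every project, task, and job directory and deletes the legacy `export_cache` subdirectory inside each one. The following is a minimal verification sketch, assuming a configured Django environment (for example, a `python manage.py shell` session); the `leftover_dirs` helper is illustrative and not part of the patch:

```python
# Verification sketch (illustrative, not part of the patch series): list any
# legacy per-object export caches that survived `manage.py exportcachecleanup`.
# Assumes a configured Django environment, e.g. a `python manage.py shell` session.
from pathlib import Path

from cvat.apps.engine.models import Job, Project, Task


def leftover_dirs() -> list[Path]:
    # The command removes "<object dir>/export_cache"; anything still on disk is a leftover
    dirs = []
    for Model in (Project, Task, Job):
        for obj in Model.objects.iterator():
            export_cache_dir = Path(obj.get_dirname()) / "export_cache"
            if export_cache_dir.is_dir():
                dirs.append(export_cache_dir)
    return dirs


print(f"{len(leftover_dirs())} legacy export cache directories remain")
```

After a successful run the printed count should be zero; a non-zero count would point at directories worth re-checking.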
From 476c68872b40710f5ac9667aee30f3413358caa9 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Thu, 9 Jan 2025 11:29:46 +0100 Subject: [PATCH 45/61] Refactor a bit --- ...41224_150942_maria_clear_cache_cron_job.md | 8 ++ cvat/apps/dataset_manager/cron.py | 65 +++++++--------- cvat/apps/dataset_manager/project.py | 5 +- cvat/apps/dataset_manager/task.py | 9 +-- .../tests/test_rest_api_formats.py | 15 ++-- cvat/apps/dataset_manager/util.py | 77 +++---------------- cvat/apps/dataset_manager/views.py | 2 +- cvat/apps/engine/backup.py | 2 +- cvat/apps/engine/tests/test_rest_api.py | 6 +- cvat/settings/base.py | 5 +- dev/format_python_code.sh | 1 + 11 files changed, 72 insertions(+), 123 deletions(-) diff --git a/changelog.d/20241224_150942_maria_clear_cache_cron_job.md b/changelog.d/20241224_150942_maria_clear_cache_cron_job.md index 52ab9046fc97..e56e1debfe9e 100644 --- a/changelog.d/20241224_150942_maria_clear_cache_cron_job.md +++ b/changelog.d/20241224_150942_maria_clear_cache_cron_job.md @@ -1,3 +1,11 @@ +### Added + +- Setting `TMP_FILE_OR_DIR_RETENTION_DAYS`, which defines the maximum retention period + of a file or dir in the temporary directory + () +- Cron job to remove outdated files and directories from the CVAT tmp directory + () + ### Changed - Export cache cleaning moved to a separate cron job diff --git a/cvat/apps/dataset_manager/cron.py b/cvat/apps/dataset_manager/cron.py index 11327f5f9338..fffb4136a39c 100644 --- a/cvat/apps/dataset_manager/cron.py +++ b/cvat/apps/dataset_manager/cron.py @@ -1,4 +1,4 @@ -# Copyright (C) 2024 CVAT.ai Corporation +# Copyright (C) 2025 CVAT.ai Corporation # # SPDX-License-Identifier: MIT @@ -23,9 +23,7 @@ from cvat.apps.dataset_manager.util import ( CacheFileOrDirPathParseError, ExportCacheManager, - OperationType, TmpDirManager, - TmpEntityType, get_export_cache_lock, ) from cvat.apps.dataset_manager.views import ( @@ -57,10 +55,10 @@ def clear_export_cache(file_path: Path) -> bool: acquire_timeout=EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT, ttl=EXPORT_CACHE_LOCK_TTL, ): - parsed_filename = ExportCacheManager.parse_file_path(file_path) + parsed_filename = ExportCacheManager.parse_filename(file_path.name) cache_ttl = get_export_cache_ttl(parsed_filename.instance_type) - if timezone.now().timestamp() <= osp.getmtime(file_path) + cache_ttl.total_seconds(): + if timezone.now().timestamp() <= file_path.stat().st_mtime + cache_ttl.total_seconds(): logger.debug(f"Export cache file {file_path.name!r} was recently accessed") return False @@ -69,39 +67,19 @@ def clear_export_cache(file_path: Path) -> bool: return True -def remove_tmp_dir(dir_path: str) -> bool: - # we do not use locks here when handling a temporary directories - # because undesired race conditions are not possible here: - # 1. A temporary directory can be removed while parsing its name or checking the last modification date. - # In that case an exception is expected and will be handled by the cron job. - # 2. A temporary directory can be removed by a worker only when it is outdated. - # 3. Each temporary directory has a unique name, so the race condition when one process is creating a directory - # and another is removing it - impossible.
- parsed = TmpDirManager.parse_tmp_child(dir_path) - assert parsed.operation == OperationType.EXPORT - assert parsed.type == TmpEntityType.DIR - cache_ttl = get_export_cache_ttl(parsed.instance_type) - - if timezone.now().timestamp() <= osp.getmtime(dir_path) + cache_ttl.total_seconds(): - return False - - shutil.rmtree(dir_path) - logger.debug(f"Temporary directory {dir_path} was successfully removed") - return True - class BaseCleanupThread(Thread, metaclass=ABCMeta): description: ClassVar[str] def __init__(self, stop_event: Event, *args, **kwargs) -> None: self._stop_event = stop_event - self._removed_entities = 0 + self._number_of_removed_objects = 0 self._exception = None super().__init__(*args, **kwargs, target=self._cleanup) @property - def removed_entities(self) -> int: - return self._removed_entities + def number_of_removed_objects(self) -> int: + return self._number_of_removed_objects @abstractmethod def _cleanup(self) -> None: ... @@ -120,18 +98,33 @@ class CleanupTmpDirThread(BaseCleanupThread): @suppress_exceptions def _cleanup(self) -> None: - for dir_path in TmpDirManager.get_export_related_dirs(): + # we do not use locks here when handling objects from tmp directory + # because undesired race conditions are not possible here: + # 1. A temporary file/directory can be removed while checking access time. + # In that case an exception is expected and is handled by the cron process. + # 2. A temporary file/directory can be removed by the cron job only when it is outdated. + # 3. Each temporary file/directory has a unique name, so the race condition when one process is creating an object + # and another is removing it - impossible. + for child in os.scandir(TmpDirManager.TMP_ROOT): # stop clean up process correctly before rq job timeout is ended if self._stop_event.is_set(): return try: - if remove_tmp_dir(dir_path): - self._removed_entities += 1 - except CacheFileOrDirPathParseError: - logger.warning(f"Cannot parse {dir_path.name}, skipping...") + if ( + child.stat().st_atime + timedelta( + days=TmpDirManager.TMP_FILE_OR_DIR_RETENTION_DAYS + ).total_seconds() < timezone.now().timestamp() + ): + if child.is_dir(): + shutil.rmtree(child.path) + else: + os.remove(child.path) + logger.debug(f"The {child.name} was successfully removed") + self._number_of_removed_objects += 1 + except FileNotFoundError: + # file or directory has been removed by another process continue - except Exception: log_exception(logger) @@ -156,7 +149,7 @@ def _cleanup(self) -> None: try: if clear_export_cache(child): - self._removed_entities += 1 + self._number_of_removed_objects += 1 except CacheFileOrDirPathParseError: logger.warning(f"Cannot parse {child.name}, skipping...") continue @@ -196,5 +189,5 @@ def cleanup(thread_class_path: str) -> None: logger.info( f"The {cleanup_thread.description!r} process has been successfully " f"completed after {int((finished_at - started_at).total_seconds())} seconds. 
" - f"{cleanup_thread.removed_entities} elements have been removed" + f"{cleanup_thread.number_of_removed_objects} elements have been removed" ) diff --git a/cvat/apps/dataset_manager/project.py b/cvat/apps/dataset_manager/project.py index c72ab380a58a..8f91e4f12651 100644 --- a/cvat/apps/dataset_manager/project.py +++ b/cvat/apps/dataset_manager/project.py @@ -12,7 +12,6 @@ from datumaro.components.errors import DatasetError, DatasetImportError, DatasetNotFoundError from django.conf import settings from django.db import transaction -from django.utils import timezone from cvat.apps.dataset_manager.task import TaskAnnotation from cvat.apps.dataset_manager.util import TmpDirManager @@ -156,7 +155,7 @@ def export( ) with ( - TmpDirManager.get_tmp_export_dir( + TmpDirManager.get_tmp_directory_for_export( instance_type=self.db_project.__class__.__name__, ) if not temp_dir else nullcontext(temp_dir) ) as temp_dir: @@ -174,7 +173,7 @@ def import_dataset(self, dataset_file, importer, **options): ) project_data.soft_attribute_import = True - with TmpDirManager.get_tmp_dir() as temp_dir: + with TmpDirManager.get_tmp_directory() as temp_dir: try: importer(dataset_file, temp_dir, project_data, load_data_callback=self.load_dataset_data, **options) except (DatasetNotFoundError, CvatDatasetNotFoundError) as not_found: diff --git a/cvat/apps/dataset_manager/task.py b/cvat/apps/dataset_manager/task.py index 9c614a4c5003..ebc007dbb898 100644 --- a/cvat/apps/dataset_manager/task.py +++ b/cvat/apps/dataset_manager/task.py @@ -15,7 +15,6 @@ from django.conf import settings from django.db import transaction from django.db.models.query import Prefetch, QuerySet -from django.utils import timezone from rest_framework.exceptions import ValidationError from cvat.apps.dataset_manager.annotation import AnnotationIR, AnnotationManager @@ -786,7 +785,7 @@ def export( ) with ( - TmpDirManager.get_tmp_export_dir( + TmpDirManager.get_tmp_directory_for_export( instance_type=self.db_job.__class__.__name__, ) if not temp_dir else nullcontext(temp_dir) ) as temp_dir: @@ -800,7 +799,7 @@ def import_annotations(self, src_file, importer, **options): ) self.delete() - with TmpDirManager.get_tmp_dir() as temp_dir: + with TmpDirManager.get_tmp_directory() as temp_dir: try: importer(src_file, temp_dir, job_data, **options) except (DatasetNotFoundError, CvatDatasetNotFoundError) as not_found: @@ -1001,7 +1000,7 @@ def export( ) with ( - TmpDirManager.get_tmp_export_dir( + TmpDirManager.get_tmp_directory_for_export( instance_type=self.db_task.__class__.__name__, ) if not temp_dir else nullcontext(temp_dir) ) as temp_dir: @@ -1015,7 +1014,7 @@ def import_annotations(self, src_file, importer, **options): ) self.delete() - with TmpDirManager.get_tmp_dir() as temp_dir: + with TmpDirManager.get_tmp_directory() as temp_dir: try: importer(src_file, temp_dir, task_data, **options) except (DatasetNotFoundError, CvatDatasetNotFoundError) as not_found: diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index 5eec5ed9bdac..50817e674ad9 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -21,6 +21,7 @@ from typing import Any, Callable, ClassVar, Optional, overload from unittest.mock import DEFAULT as MOCK_DEFAULT from unittest.mock import MagicMock, patch +from pathlib import Path import av import numpy as np @@ -1520,7 +1521,7 @@ def _clear(*_, file_path: str): side_effect(set_condition, 
clear_removed_the_file), ) - clear_export_cache(file_path=file_path) + clear_export_cache(file_path=Path(file_path)) set_condition(clear_has_been_finished) mock_os_remove.assert_not_called() @@ -1699,7 +1700,7 @@ def _clear(*_, file_path: str): exited_by_timeout = False try: - clear_export_cache(file_path=file_path) + clear_export_cache(file_path=Path(file_path)) except LockNotAvailableError: # should come from waiting for get_export_cache_lock exited_by_timeout = True @@ -2027,7 +2028,7 @@ def test_cleanup_can_remove_file(self): patch("cvat.apps.dataset_manager.views.TTL_CONSTS", new={"task": timedelta(seconds=0)}), ): export_path = export(dst_format=format_name, task_id=task_id) - clear_export_cache(file_path=export_path) + clear_export_cache(file_path=Path(export_path)) self.assertFalse(osp.isfile(export_path)) @@ -2035,7 +2036,7 @@ def test_cleanup_can_remove_file(self): def test_cleanup_can_fail_if_no_file(self): from cvat.apps.dataset_manager.util import CacheFileOrDirPathParseError with self.assertRaises(CacheFileOrDirPathParseError): - clear_export_cache(file_path="non existent file path") + clear_export_cache(file_path=Path("non existent file path")) def test_cleanup_can_defer_removal_if_file_is_used_recently(self): from os import remove as original_remove @@ -2050,13 +2051,13 @@ def test_cleanup_can_defer_removal_if_file_is_used_recently(self): patch("cvat.apps.dataset_manager.cron.os.remove", side_effect=original_remove) as mock_os_remove, ): export_path = export(dst_format=format_name, task_id=task_id) - clear_export_cache(file_path=export_path) + clear_export_cache(file_path=Path(export_path)) mock_os_remove.assert_not_called() self.assertTrue(osp.isfile(export_path)) def test_cleanup_cron_job_can_delete_cached_files(self): - from cvat.apps.dataset_manager.cron import cron_export_cache_cleanup + from cvat.apps.dataset_manager.cron import cleanup def _get_project_task_job_ids(): project = self._create_project(projects["main"]) @@ -2100,7 +2101,7 @@ def _get_project_task_job_ids(): ): mock_rq_job = MagicMock(timeout=100) mock_rq_get_current_job.return_value = mock_rq_job - cron_export_cache_cleanup() + cleanup('cvat.apps.dataset_manager.cron.CleanupExportCacheThread') mock_clear_export_cache.assert_called_once() self.assertFalse(osp.exists(export_path)) diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index 5e04e6a70a0a..ef2ef26f75d8 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -1,5 +1,5 @@ # Copyright (C) 2019-2022 Intel Corporation -# Copyright (C) 2023-2024 CVAT.ai Corporation +# Copyright (C) 2023-2025 CVAT.ai Corporation # # SPDX-License-Identifier: MIT @@ -14,7 +14,6 @@ from copy import deepcopy from datetime import timedelta from enum import Enum -from pathlib import Path from threading import Lock from typing import Any @@ -157,13 +156,11 @@ def get_export_cache_lock( class OperationType(str, Enum): EXPORT = "export" - IMPORT = "import" @classmethod def values(cls) -> list[str]: return list(map(lambda x: x.value, cls)) -# todo: rename class ExportFileType(str, Enum): ANNOTATIONS = "annotations" BACKUP = "backup" @@ -200,43 +197,25 @@ class ParsedDatasetFilename(_ParsedExportFilename): class ParsedBackupFilename(_ParsedExportFilename): pass -class TmpEntityType(str, Enum): - DIR = "dir" - FILE = "file" - -@attrs.frozen -class ParsedTmpEntity: - instance_type: InstanceType = attrs.field(converter=InstanceType) - operation: OperationType = attrs.field(converter=OperationType) - -@attrs.frozen 
-class ParsedTmpDir(ParsedTmpEntity): - type: TmpEntityType = attrs.field(init=False, default=TmpEntityType.DIR) - -@attrs.frozen -class ParsedTmpFile(ParsedTmpEntity): - type: TmpEntityType = attrs.field(init=False, default=TmpEntityType.FILE) - class TmpDirManager: SPLITTER = "-" TMP_ROOT = settings.TMP_FILES_ROOT - - @classmethod - def get_export_related_dirs(cls) -> Generator[Path, Any, Any]: - for item in Path(cls.TMP_ROOT).glob(f"{OperationType.EXPORT}*"): - if item.is_dir(): - yield item + TMP_FILE_OR_DIR_RETENTION_DAYS = settings.TMP_FILE_OR_DIR_RETENTION_DAYS @classmethod @contextmanager - def get_tmp_dir( + def get_tmp_directory( cls, *, prefix: str | None = None, suffix: str | None = None, ignore_cleanup_errors: bool | None = None, ) -> Generator[str, Any, Any]: + """ + The method creates a temporary directory and + ensures that the parent directory is the CVAT tmp directory + """ params = {} for k, v in { "prefix": prefix, @@ -251,45 +230,17 @@ def get_tmp_dir( @classmethod @contextmanager - def get_tmp_export_dir( + def get_tmp_directory_for_export( cls, *, instance_type: str, ) -> Generator[str, Any, Any]: instance_type = InstanceType(instance_type.lower()) - with cls.get_tmp_dir( + with cls.get_tmp_directory( prefix=cls.SPLITTER.join([OperationType.EXPORT, instance_type]) + cls.SPLITTER ) as tmp_dir: yield tmp_dir - @classmethod - def parse_tmp_child(cls, child_path: os.PathLike[str]) -> ParsedTmpDir | ParsedTmpFile: - child_path = Path(osp.normpath(child_path)) - - if child_path.is_dir(): - dir_name = child_path.name - - basename_match = re.match( - ( - rf"^(?P<operation>{'|'.join(OperationType.values())}){cls.SPLITTER}" - rf"(?P<instance_type>{'|'.join(InstanceType.values())}){cls.SPLITTER}" - ), - dir_name, - ) - - if not basename_match: - raise CacheFileOrDirPathParseError(f"Couldn't parse directory name: {dir_name!r}") - - try: - parsed_dir_name = ParsedTmpDir( - **basename_match.groupdict() - ) - except ValueError as ex: - raise CacheFileOrDirPathParseError(f"Couldn't parse directory name: {dir_name!r}") from ex - - return parsed_dir_name - - raise NotImplementedError() class ExportCacheManager: SPLITTER = "-" @@ -355,15 +306,11 @@ def make_backup_file_path( return osp.join(settings.EXPORT_CACHE_ROOT, filename) @classmethod - def parse_file_path( - cls, file_path: os.PathLike[str], + def parse_filename( + cls, filename: str, ) -> ParsedDatasetFilename | ParsedBackupFilename: - file_path = osp.normpath(file_path) - basename = osp.split(file_path)[1] - basename, file_ext = osp.splitext(basename) + basename, file_ext = osp.splitext(filename) file_ext = file_ext.strip(".").lower() - - # handle file name basename_match = re.fullmatch( ( rf"^(?P<instance_type>{'|'.join(InstanceType.values())})" diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index 20c14a73eb5f..2812dd4f3162 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -167,7 +167,7 @@ def export( extend_export_file_lifetime(output_path) return output_path - with TmpDirManager.get_tmp_export_dir(instance_type=instance_type) as temp_dir: + with TmpDirManager.get_tmp_directory_for_export(instance_type=instance_type) as temp_dir: temp_file = osp.join(temp_dir, 'result') export_fn(db_instance.id, temp_file, format_name=dst_format, server_url=server_url, save_images=save_images, temp_dir=temp_dir) diff --git a/cvat/apps/engine/backup.py b/cvat/apps/engine/backup.py index c32c136f248e..0bb9868b5ae3 100644 --- a/cvat/apps/engine/backup.py +++ b/cvat/apps/engine/backup.py @@
-1083,7 +1083,7 @@ def create_backup( extend_export_file_lifetime(output_path) return output_path - with TmpDirManager.get_tmp_export_dir(instance_type=instance_type) as tmp_dir: + with TmpDirManager.get_tmp_directory_for_export(instance_type=instance_type) as tmp_dir: temp_file = os.path.join(tmp_dir, 'dump') exporter = Exporter(db_instance.id) exporter.export_to(temp_file) diff --git a/cvat/apps/engine/tests/test_rest_api.py b/cvat/apps/engine/tests/test_rest_api.py index cb613c60d048..2b477702f8fb 100644 --- a/cvat/apps/engine/tests/test_rest_api.py +++ b/cvat/apps/engine/tests/test_rest_api.py @@ -3108,7 +3108,7 @@ def test_api_v2_tasks_id_export_no_auth(self): self._run_api_v2_tasks_id_export_import(None) def test_can_remove_export_cache_automatically_after_successful_export(self): - from cvat.apps.dataset_manager.cron import clear_export_cache, cron_export_cache_cleanup + from cvat.apps.dataset_manager.cron import clear_export_cache, cleanup self._create_tasks() task_id = self.tasks[0]["id"] user = self.admin @@ -3127,7 +3127,7 @@ def test_can_remove_export_cache_automatically_after_successful_export(self): ): mock_rq_job = mock.MagicMock(timeout=100) mock_rq_get_current_job.return_value = mock_rq_job - cron_export_cache_cleanup() + cleanup("cvat.apps.dataset_manager.cron.CleanupExportCacheThread") mock_clear_export_cache.assert_not_called() response = self._run_api_v2_tasks_id_export(task_id, user) @@ -3148,7 +3148,7 @@ def test_can_remove_export_cache_automatically_after_successful_export(self): mock.patch('cvat.apps.dataset_manager.views.TASK_CACHE_TTL', new=timedelta(seconds=0)), mock.patch('cvat.apps.dataset_manager.views.TTL_CONSTS', new={'task': timedelta(seconds=0)}), ): - cron_export_cache_cleanup() + cleanup("cvat.apps.dataset_manager.cron.CleanupExportCacheThread") mock_clear_export_cache.assert_called_once() self.assertFalse(os.path.exists(file_path)) diff --git a/cvat/settings/base.py b/cvat/settings/base.py index c28da9b9f7aa..73ef6c4f7cc1 100644 --- a/cvat/settings/base.py +++ b/cvat/settings/base.py @@ -359,7 +359,6 @@ class CVAT_QUEUES(Enum): 'func': 'cvat.apps.dataset_manager.cron.cleanup', # Run twice a day (at midnight and at noon) 'cron_string': '0 0,12 * * *', - # 'cron_string': '50 11 * * *', 'args': ('cvat.apps.dataset_manager.cron.CleanupExportCacheThread',), }, { @@ -368,7 +367,6 @@ class CVAT_QUEUES(Enum): 'func': 'cvat.apps.dataset_manager.cron.cleanup', # Run once a day 'cron_string': '0 18 * * *', - # 'cron_string': '17 12 * * *', 'args': ('cvat.apps.dataset_manager.cron.CleanupTmpDirThread',), } ] @@ -766,3 +764,6 @@ class CVAT_QUEUES(Enum): CLOUD_DATA_DOWNLOADING_MAX_THREADS_NUMBER = 4 CLOUD_DATA_DOWNLOADING_NUMBER_OF_FILES_PER_THREAD = 1000 + +# Indicates the maximum number of days a file or directory is retained in the temporary directory +TMP_FILE_OR_DIR_RETENTION_DAYS = 14 diff --git a/dev/format_python_code.sh b/dev/format_python_code.sh index f7220679073c..6d4288ec1b4a 100755 --- a/dev/format_python_code.sh +++ b/dev/format_python_code.sh @@ -36,6 +36,7 @@ for paths in \ "cvat/apps/dataset_manager/tests/utils.py" \ "cvat/apps/events/signals.py" \ "cvat/apps/engine/management/commands/syncperiodicjobs.py" \ + "cvat/apps/dataset_manager/management/commands/exportcachecleanup.py" \ ; do ${BLACK} -- ${paths} ${ISORT} -- ${paths} From edd2268905efbf8e6b2456a15cc3555c76c72ec9 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Thu, 9 Jan 2025 12:04:34 +0100 Subject: [PATCH 46/61] Fix imports sorting --- 
cvat/apps/dataset_manager/tests/test_rest_api_formats.py | 2 +- cvat/apps/engine/tests/test_rest_api.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index 50817e674ad9..181da6a504ec 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -16,12 +16,12 @@ from datetime import timedelta from functools import partial from io import BytesIO +from pathlib import Path from tempfile import TemporaryDirectory from time import sleep from typing import Any, Callable, ClassVar, Optional, overload from unittest.mock import DEFAULT as MOCK_DEFAULT from unittest.mock import MagicMock, patch -from pathlib import Path import av import numpy as np diff --git a/cvat/apps/engine/tests/test_rest_api.py b/cvat/apps/engine/tests/test_rest_api.py index 2b477702f8fb..5d80971184fd 100644 --- a/cvat/apps/engine/tests/test_rest_api.py +++ b/cvat/apps/engine/tests/test_rest_api.py @@ -3108,7 +3108,7 @@ def test_api_v2_tasks_id_export_no_auth(self): self._run_api_v2_tasks_id_export_import(None) def test_can_remove_export_cache_automatically_after_successful_export(self): - from cvat.apps.dataset_manager.cron import clear_export_cache, cleanup + from cvat.apps.dataset_manager.cron import cleanup, clear_export_cache self._create_tasks() task_id = self.tasks[0]["id"] user = self.admin From ae68b9b41ac70392f2d465366da39b35b54787db Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Thu, 9 Jan 2025 12:07:12 +0100 Subject: [PATCH 47/61] Remove todo --- cvat/apps/dataset_manager/cron.py | 1 - cvat/apps/engine/views.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/cvat/apps/dataset_manager/cron.py b/cvat/apps/dataset_manager/cron.py index fffb4136a39c..80cb8ec089d9 100644 --- a/cvat/apps/dataset_manager/cron.py +++ b/cvat/apps/dataset_manager/cron.py @@ -5,7 +5,6 @@ from __future__ import annotations import os -import os.path as osp import shutil from abc import ABCMeta, abstractmethod from datetime import timedelta diff --git a/cvat/apps/engine/views.py b/cvat/apps/engine/views.py index 82e6b46ba054..6a5691dc1182 100644 --- a/cvat/apps/engine/views.py +++ b/cvat/apps/engine/views.py @@ -541,7 +541,7 @@ def upload_finished(self, request): tmp_dir = self._object.get_tmp_dirname() uploaded_file = os.path.join(tmp_dir, filename) if not os.path.isfile(uploaded_file): - uploaded_file = None # TODO: why is this needed + uploaded_file = None return _import_project_dataset( request=request, From aab5350c5f4485eaa77736bc489cb6eb929e0e08 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Thu, 9 Jan 2025 12:29:42 +0100 Subject: [PATCH 48/61] Run CI From 716f13ced1eea2cbf9aa03c8687106c6958c2d66 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Thu, 9 Jan 2025 13:58:37 +0100 Subject: [PATCH 49/61] Try to fix tests --- cvat/apps/dataset_manager/cron.py | 2 +- cvat/apps/dataset_manager/tests/test_formats.py | 4 ++-- cvat/apps/dataset_manager/views.py | 4 +++- cvat/apps/engine/backup.py | 1 - 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cvat/apps/dataset_manager/cron.py b/cvat/apps/dataset_manager/cron.py index 80cb8ec089d9..4444c661416e 100644 --- a/cvat/apps/dataset_manager/cron.py +++ b/cvat/apps/dataset_manager/cron.py @@ -103,7 +103,7 @@ def _cleanup(self) -> None: # In that case an exception is expected and is handled by the cron process. # 2. 
A temporary file/directory can be removed by the cron job only when it is outdated. # 3. Each temporary file/directory has a unique name, so the race condition when one process is creating an object - # and another is removing it - impossible. + # and another is removing it - impossible. for child in os.scandir(TmpDirManager.TMP_ROOT): # stop clean up process correctly before rq job timeout is ended if self._stop_event.is_set(): diff --git a/cvat/apps/dataset_manager/tests/test_formats.py b/cvat/apps/dataset_manager/tests/test_formats.py index 097884092de0..eb4e70785637 100644 --- a/cvat/apps/dataset_manager/tests/test_formats.py +++ b/cvat/apps/dataset_manager/tests/test_formats.py @@ -262,7 +262,7 @@ def _test_export(check, task, format_name, **export_args): with tempfile.TemporaryDirectory() as temp_dir: file_path = osp.join(temp_dir, format_name) dm.task.export_task(task["id"], file_path, - format_name, **export_args) + format_name=format_name, **export_args) check(file_path) @@ -989,7 +989,7 @@ def _test_can_import_annotations(self, task, import_format): if import_format == "CVAT 1.1": export_format = "CVAT for images 1.1" - dm.task.export_task(task["id"], file_path, export_format) + dm.task.export_task(task["id"], file_path, format_name=export_format) expected_ann = TaskAnnotation(task["id"]) expected_ann.init_from_db() diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index 2812dd4f3162..74ff344c60f2 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -169,8 +169,10 @@ def export( with TmpDirManager.get_tmp_directory_for_export(instance_type=instance_type) as temp_dir: temp_file = osp.join(temp_dir, 'result') + temp_subdir = osp.join(temp_dir, 'subdir') + os.makedirs(temp_subdir, exist_ok=True) export_fn(db_instance.id, temp_file, format_name=dst_format, - server_url=server_url, save_images=save_images, temp_dir=temp_dir) + server_url=server_url, save_images=save_images, temp_dir=temp_subdir) with get_export_cache_lock( output_path, ttl=EXPORT_CACHE_LOCK_TTL, diff --git a/cvat/apps/engine/backup.py b/cvat/apps/engine/backup.py index 0bb9868b5ae3..d85f3289bfad 100644 --- a/cvat/apps/engine/backup.py +++ b/cvat/apps/engine/backup.py @@ -1202,7 +1202,6 @@ def _import(importer, request, queue, rq_id, Serializer, file_field_name, locati return Response(serializer.data, status=status.HTTP_202_ACCEPTED) def get_backup_dirname(): - # FUTURE-FIXME return TmpDirManager.TMP_ROOT def import_project(request, queue_name, filename=None): From 7abd5ab8f65c4e56a24d04dae426d2987d62c8b8 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Thu, 9 Jan 2025 18:09:58 +0100 Subject: [PATCH 50/61] os.replace -> shutil.move --- cvat/apps/dataset_manager/tests/test_rest_api_formats.py | 8 ++++---- cvat/apps/dataset_manager/views.py | 7 ++++++- cvat/apps/engine/backup.py | 2 +- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index f67cf62e6fea..31197342f13f 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -1486,7 +1486,7 @@ def patched_log_exception(logger=None, exc_info=True): # only after checking whether a file exists inside an acquired lock patch("cvat.apps.dataset_manager.views.osp_exists") as mock_osp_exists, patch( - "cvat.apps.dataset_manager.views.os.replace", side_effect=original_replace + 
"cvat.apps.dataset_manager.views.shutil.move", side_effect=original_replace ) as mock_os_replace, patch("cvat.apps.dataset_manager.views.log_exception", new=patched_log_exception), patch("cvat.apps.dataset_manager.views.task.export_task") as mock_export_fn, @@ -1861,7 +1861,7 @@ def test_export_can_reuse_older_file_if_still_relevant(self): patch( "cvat.apps.dataset_manager.views.osp_exists", side_effect=original_exists ) as mock_osp_exists, - patch("cvat.apps.dataset_manager.views.os.replace") as mock_os_replace, + patch("cvat.apps.dataset_manager.views.shutil.move") as mock_os_replace, ): second_export_path = export(dst_format=format_name, task_id=task_id) @@ -1908,7 +1908,7 @@ def _export_1( "cvat.apps.dataset_manager.views.get_export_cache_lock", new=self.patched_get_export_cache_lock, ), - patch("cvat.apps.dataset_manager.views.os.replace") as mock_os_replace, + patch("cvat.apps.dataset_manager.views.shutil.move") as mock_os_replace, patch("cvat.apps.dataset_manager.views.task.export_task") as mock_export_fn, patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), ): @@ -1948,7 +1948,7 @@ def _export_2( "cvat.apps.dataset_manager.views.get_export_cache_lock", new=self.patched_get_export_cache_lock, ), - patch("cvat.apps.dataset_manager.views.os.replace") as mock_os_replace, + patch("cvat.apps.dataset_manager.views.shutil.move") as mock_os_replace, patch("cvat.apps.dataset_manager.views.task.export_task") as mock_export_fn, patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), ): diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index 74ff344c60f2..351bd4c76491 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -6,6 +6,7 @@ import logging import os import os.path as osp +import shutil from datetime import timedelta from os.path import exists as osp_exists @@ -169,16 +170,20 @@ def export( with TmpDirManager.get_tmp_directory_for_export(instance_type=instance_type) as temp_dir: temp_file = osp.join(temp_dir, 'result') + # create a subdirectory to store export-related files, + # which will be fully included in the resulting archive temp_subdir = osp.join(temp_dir, 'subdir') os.makedirs(temp_subdir, exist_ok=True) + export_fn(db_instance.id, temp_file, format_name=dst_format, server_url=server_url, save_images=save_images, temp_dir=temp_subdir) + with get_export_cache_lock( output_path, ttl=EXPORT_CACHE_LOCK_TTL, acquire_timeout=EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT, ): - os.replace(temp_file, output_path) + shutil.move(temp_file, output_path) logger.info( f"The {db_instance.__class__.__name__.lower()} '{db_instance.id}' is exported " diff --git a/cvat/apps/engine/backup.py b/cvat/apps/engine/backup.py index d85f3289bfad..b983d39b86f3 100644 --- a/cvat/apps/engine/backup.py +++ b/cvat/apps/engine/backup.py @@ -1094,7 +1094,7 @@ def create_backup( acquire_timeout=EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT, ttl=EXPORT_CACHE_LOCK_TTL, ): - os.replace(temp_file, output_path) + shutil.move(temp_file, output_path) logger.info( f"The {db_instance.__class__.__name__.lower()} '{db_instance.id}' is backed up at {output_path!r} " From f3c4120c77fffdf0f12c1e76d4611dc30f27084a Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 10 Jan 2025 11:29:26 +0100 Subject: [PATCH 51/61] Write to self.stdout --- cvat/apps/dataset_manager/cron.py | 7 +++---- .../management/commands/exportcachecleanup.py | 10 +++++----- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/cvat/apps/dataset_manager/cron.py 
b/cvat/apps/dataset_manager/cron.py index 4444c661416e..c97648bffb7f 100644 --- a/cvat/apps/dataset_manager/cron.py +++ b/cvat/apps/dataset_manager/cron.py @@ -66,7 +66,6 @@ def clear_export_cache(file_path: Path) -> bool: return True - class BaseCleanupThread(Thread, metaclass=ABCMeta): description: ClassVar[str] @@ -111,9 +110,9 @@ def _cleanup(self) -> None: try: if ( - child.stat().st_atime + timedelta( - days=TmpDirManager.TMP_FILE_OR_DIR_RETENTION_DAYS - ).total_seconds() < timezone.now().timestamp() + child.stat().st_atime + + timedelta(days=TmpDirManager.TMP_FILE_OR_DIR_RETENTION_DAYS).total_seconds() + < timezone.now().timestamp() ): if child.is_dir(): shutil.rmtree(child.path) diff --git a/cvat/apps/dataset_manager/management/commands/exportcachecleanup.py b/cvat/apps/dataset_manager/management/commands/exportcachecleanup.py index 0ee98ec7c3be..710ec7f91f85 100644 --- a/cvat/apps/dataset_manager/management/commands/exportcachecleanup.py +++ b/cvat/apps/dataset_manager/management/commands/exportcachecleanup.py @@ -20,24 +20,24 @@ def update_progress(): progress = (i + 1) / objects_count done = int(progress_bar_len * progress) progress_bar = "#" * done + "-" * (progress_bar_len - done) - print(f"\rProgress: |{progress_bar}| {progress:.0%}", end="", flush=True) + self.stdout.write(f"\rProgress: |{progress_bar}| {progress:.0%}", ending="") now = timezone.now() progress_bar_len = os.get_terminal_size().columns // 2 for Model in (Project, Task, Job): - print(f"\nDeleting the export cache for {Model.__name__.lower()}s...") + self.stdout.write(f"\nDeleting the export cache for {Model.__name__.lower()}s...") queryset = Model.objects.filter(created_date__lt=now) objects_count = queryset.count() if objects_count < 1: continue msg = ( - f"\nThe {objects_count} folders are going to be checked" + f"The {objects_count} folders are going to be checked" if objects_count > 1 - else "\nThe 1 folder is going to be checked" + else "The 1 folder is going to be checked" ) - print(msg) + self.stdout.write(msg) for i, obj in enumerate(queryset.iterator()): update_progress() From 15cf95882a5fc246a7b95ec694847609a9d206bd Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 10 Jan 2025 18:43:57 +0100 Subject: [PATCH 52/61] Reduce max lifetime of tmp files/dirs --- cvat/settings/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cvat/settings/base.py b/cvat/settings/base.py index 73ef6c4f7cc1..14e5db44c81d 100644 --- a/cvat/settings/base.py +++ b/cvat/settings/base.py @@ -766,4 +766,4 @@ class CVAT_QUEUES(Enum): CLOUD_DATA_DOWNLOADING_NUMBER_OF_FILES_PER_THREAD = 1000 # Indicates the maximum number of days a file or directory is retained in the temporary directory -TMP_FILE_OR_DIR_RETENTION_DAYS = 14 +TMP_FILE_OR_DIR_RETENTION_DAYS = 3 From e393a1051d5660789d9e7e7abe3da18d5ed5667a Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 10 Jan 2025 18:48:58 +0100 Subject: [PATCH 53/61] Suppress exception --- .../dataset_manager/management/commands/exportcachecleanup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cvat/apps/dataset_manager/management/commands/exportcachecleanup.py b/cvat/apps/dataset_manager/management/commands/exportcachecleanup.py index 710ec7f91f85..be453db13508 100644 --- a/cvat/apps/dataset_manager/management/commands/exportcachecleanup.py +++ b/cvat/apps/dataset_manager/management/commands/exportcachecleanup.py @@ -4,6 +4,7 @@ import os import shutil +from contextlib import suppress from pathlib import Path from 
django.core.management.base import BaseCommand @@ -42,5 +43,5 @@ def update_progress(): for i, obj in enumerate(queryset.iterator()): update_progress() export_cache_dir = Path(obj.get_dirname()) / "export_cache" - if export_cache_dir.exists(): + with suppress(FileNotFoundError): shutil.rmtree(export_cache_dir) From 606c3821914a3025944498b85398e3e35bc3ef2d Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 10 Jan 2025 19:39:26 +0100 Subject: [PATCH 54/61] Update doc --- .../administration/advanced/upgrade_guide.md | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/site/content/en/docs/administration/advanced/upgrade_guide.md b/site/content/en/docs/administration/advanced/upgrade_guide.md index cf1bb43df8aa..0d0892b79bf4 100644 --- a/site/content/en/docs/administration/advanced/upgrade_guide.md +++ b/site/content/en/docs/administration/advanced/upgrade_guide.md @@ -59,8 +59,20 @@ To upgrade CVAT, follow these steps: ## Upgrade CVAT after v2.25.0 In version 2.25.0, CVAT changed the location where the export cache is stored. -To clean up the outdated cache, run the following command: `python manage.py exportcachecleanup`. - +To clean up the outdated cache, run the command depending on how CVAT is deployed: + +{{< tabpane lang="shell" >}} + {{< tab header="Docker" >}} + docker exec -it cvat_server python manage.py exportcachecleanup + {{< /tab >}} + {{< tab header="Kubernetes" >}} + cvat_backend_pod=$(kubectl get pods -l component=server -o 'jsonpath={.items[0].metadata.name}') + kubectl exec -it ${cvat_backend_pod} -- python manage.py exportcachecleanup + {{< /tab >}} + {{< tab header="Development" >}} + python manage.py exportcachecleanup + {{< /tab >}} +{{< /tabpane >}} ## How to upgrade CVAT from v2.2.0 to v2.3.0. 
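Regarding the retention window reduced in PATCH 52 above: the tmp-directory cleanup job introduced in PATCH 45 treats an entry under `TMP_FILES_ROOT` as outdated once its last access time is older than `TMP_FILE_OR_DIR_RETENTION_DAYS`. Below is a minimal sketch of that check, assuming a configured Django environment; the `is_outdated` helper and the hardcoded constant are illustrative, not part of the patches:

```python
# Sketch of the retention check the tmp-directory cron job applies to each
# entry returned by os.scandir() (illustrative helper, not part of the patches).
# Assumes a configured Django environment, since django.utils.timezone is used.
import os
from datetime import timedelta

from django.utils import timezone

RETENTION_DAYS = 3  # mirrors TMP_FILE_OR_DIR_RETENTION_DAYS after PATCH 52


def is_outdated(entry: os.DirEntry) -> bool:
    # Outdated means: the last access time (st_atime) is older than the
    # retention window counted back from now.
    cutoff = timezone.now().timestamp() - timedelta(days=RETENTION_DAYS).total_seconds()
    return entry.stat().st_atime < cutoff
```

Entries that fail the check are deleted with `shutil.rmtree` or `os.remove`; if another process removes an entry first, the resulting `FileNotFoundError` is simply skipped.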
From 30b120fc7ce16838b93060aa7254466127488446 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Mon, 13 Jan 2025 13:11:09 +0100 Subject: [PATCH 55/61] Allow calling cron_export_cache_cleanup via runperiodicjob --- cvat/apps/dataset_manager/cron.py | 54 ++++++++++++------- .../management/commands/runperiodicjob.py | 3 +- 2 files changed, 37 insertions(+), 20 deletions(-) diff --git a/cvat/apps/dataset_manager/cron.py b/cvat/apps/dataset_manager/cron.py index c97648bffb7f..9619686a505b 100644 --- a/cvat/apps/dataset_manager/cron.py +++ b/cvat/apps/dataset_manager/cron.py @@ -8,15 +8,15 @@ import shutil from abc import ABCMeta, abstractmethod from datetime import timedelta +from enum import Enum from functools import wraps from pathlib import Path from threading import Event, Thread from time import sleep -from typing import Callable, ClassVar +from typing import Callable, ClassVar, Type from django.conf import settings from django.utils import timezone -from django.utils.module_loading import import_string from rq import get_current_job from cvat.apps.dataset_manager.util import ( @@ -156,31 +156,47 @@ def _cleanup(self) -> None: log_exception(logger) -def cleanup(thread_class_path: str) -> None: - ThreadClass = import_string(thread_class_path) - assert issubclass(ThreadClass, BaseCleanupThread) +class CleanupType(str, Enum): + EXPORT_CACHE = "export_cache" + TEMP_DIRECTORY = "temp_directory" + + def to_thread_class(self) -> Type[CleanupExportCacheThread | CleanupTmpDirThread]: + if CleanupType.EXPORT_CACHE == self.value: + ThreadClass = CleanupExportCacheThread + elif CleanupType.TEMP_DIRECTORY == self.value: + ThreadClass = CleanupTmpDirThread + else: + raise ValueError(f"Unknown cleaning type: {self.value}") + return ThreadClass - started_at = timezone.now() - rq_job = get_current_job() - seconds_left = rq_job.timeout - 60 - sleep_interval = 10 - assert seconds_left > sleep_interval - finish_before = started_at + timedelta(seconds=seconds_left) +def cleanup(cleanup_type: CleanupType) -> None: + started_at = timezone.now() stop_event = Event() + ThreadClass = CleanupType(cleanup_type).to_thread_class() cleanup_thread = ThreadClass(stop_event=stop_event) - cleanup_thread.start() - while timezone.now() < finish_before: - if not cleanup_thread.is_alive(): + if rq_job := get_current_job(): + seconds_left = rq_job.timeout - 60 + sleep_interval = 10 + assert seconds_left > sleep_interval + finish_before = started_at + timedelta(seconds=seconds_left) + cleanup_thread.start() + + while timezone.now() < finish_before: + if not cleanup_thread.is_alive(): + stop_event.set() + break + sleep(sleep_interval) + + if not stop_event.is_set(): stop_event.set() - break - sleep(sleep_interval) - if not stop_event.is_set(): - stop_event.set() + cleanup_thread.join() + else: + # run func in the current thread + cleanup_thread.run() - cleanup_thread.join() cleanup_thread.raise_if_exception() finished_at = timezone.now() diff --git a/cvat/apps/engine/management/commands/runperiodicjob.py b/cvat/apps/engine/management/commands/runperiodicjob.py index 765f16541cfd..25ad62fd6e74 100644 --- a/cvat/apps/engine/management/commands/runperiodicjob.py +++ b/cvat/apps/engine/management/commands/runperiodicjob.py @@ -10,6 +10,7 @@ class Command(BaseCommand): def add_arguments(self, parser: ArgumentParser) -> None: parser.add_argument("job_id", help="ID of the job to run") + parser.add_argument("job_args", nargs="*", help="Arguments to pass to the job") def handle(self, *args, **options): job_id = options["job_id"] @@ 
-17,7 +18,7 @@ def handle(self, *args, **options): for job_definition in settings.PERIODIC_RQ_JOBS: if job_definition["id"] == job_id: job_func = import_string(job_definition["func"]) - job_func() + job_func(*options["job_args"]) return raise CommandError(f"Job with ID {job_id} not found") From 1f2057c44617e740cb933b9509516fbd86cb8f23 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Mon, 13 Jan 2025 13:56:08 +0100 Subject: [PATCH 56/61] Pass object id to background job --- cvat/apps/engine/background.py | 2 +- cvat/apps/engine/backup.py | 43 +++++++++++++++++++++++----------- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/cvat/apps/engine/background.py b/cvat/apps/engine/background.py index e2050064122f..f2b5d0e89b6d 100644 --- a/cvat/apps/engine/background.py +++ b/cvat/apps/engine/background.py @@ -713,7 +713,7 @@ def setup_background_job( func = self.export_callback func_args = ( - self.db_instance, + self.db_instance.id, Exporter, logger, cache_ttl, diff --git a/cvat/apps/engine/backup.py b/cvat/apps/engine/backup.py index b983d39b86f3..41fbee2815d8 100644 --- a/cvat/apps/engine/backup.py +++ b/cvat/apps/engine/backup.py @@ -15,11 +15,12 @@ from enum import Enum from logging import Logger from tempfile import NamedTemporaryFile -from typing import Any, Optional, Type, Union +from typing import Any, Optional, Type, Union, ClassVar from zipfile import ZipFile import django_rq from django.conf import settings +from django.core.exceptions import ObjectDoesNotExist from django.db import transaction from django.utils import timezone from rest_framework import serializers, status @@ -342,6 +343,8 @@ def _get_db_jobs(self): return () class _ExporterBase(metaclass=ABCMeta): + ModelClass: ClassVar[models.Project | models.Task] + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -373,7 +376,18 @@ def _write_directory(self, source_dir, zip_object, target_dir, recursive=True, e def export_to(self, file: str | ZipFile, target_dir: str | None = None): ... 
+ @classmethod + def get_object(cls, pk: int) -> models.Project | models.Task: + # FUTURE-FIXME: need to check permissions one more time when background task is called + try: + return cls.ModelClass.objects.get(pk=pk) + except ObjectDoesNotExist: + raise ValidationError(f'Such a {cls.ModelClass.__name__.lower()} does not exist') + + class TaskExporter(_ExporterBase, _TaskBackupBase): + ModelClass: ClassVar[models.Task] = models.Task + def __init__(self, pk, version=Version.V1): super().__init__(logger=slogger.task[pk]) @@ -934,9 +948,11 @@ def _prepare_project_meta(self, project): return self._prepare_meta(allowed_fields, project) class ProjectExporter(_ExporterBase, _ProjectBackupBase): + ModelClass: ClassVar[models.Project] = models.Project + def __init__(self, pk, version=Version.V1): super().__init__(logger=slogger.project[pk]) - self._db_project = models.Project.objects.prefetch_related('tasks', 'annotation_guide__assets').select_related('annotation_guide').get(pk=pk) + self._db_project = self.ModelClass.objects.prefetch_related('tasks', 'annotation_guide__assets').select_related('annotation_guide').get(pk=pk) self._version = version db_labels = self._db_project.label_set.all().prefetch_related('attributespec_set') @@ -1055,23 +1071,22 @@ def _import_project(filename, user, org_id): def create_backup( - # FUTURE-FIXME: there db_instance_id should be passed - db_instance: models.Project | models.Task, + instance_id: int, Exporter: Type[ProjectExporter | TaskExporter], logger: Logger, cache_ttl: timedelta, ): - try: - instance_type = db_instance.__class__.__name__ - db_instance.refresh_from_db(fields=['updated_date']) - instance_timestamp = timezone.localtime(db_instance.updated_date).timestamp() - - output_path = ExportCacheManager.make_backup_file_path( - instance_id=db_instance.id, - instance_type=instance_type, - instance_timestamp=instance_timestamp - ) + db_instance = Exporter.get_object(instance_id) + instance_type = db_instance.__class__.__name__ + instance_timestamp = timezone.localtime(db_instance.updated_date).timestamp() + + output_path = ExportCacheManager.make_backup_file_path( + instance_id=db_instance.id, + instance_type=instance_type, + instance_timestamp=instance_timestamp + ) + try: with get_export_cache_lock( output_path, block=True, From 28d6d9fd1002f59e479c898d853e986c4ad66aa1 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Mon, 13 Jan 2025 13:56:37 +0100 Subject: [PATCH 57/61] Update site/content/en/docs/administration/advanced/upgrade_guide.md Co-authored-by: Maxim Zhiltsov --- site/content/en/docs/administration/advanced/upgrade_guide.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/site/content/en/docs/administration/advanced/upgrade_guide.md b/site/content/en/docs/administration/advanced/upgrade_guide.md index 0d0892b79bf4..5ca168dcf62d 100644 --- a/site/content/en/docs/administration/advanced/upgrade_guide.md +++ b/site/content/en/docs/administration/advanced/upgrade_guide.md @@ -61,6 +61,8 @@ To upgrade CVAT, follow these steps: In version 2.25.0, CVAT changed the location where the export cache is stored. To clean up the outdated cache, run the command depending on how CVAT is deployed: + + {{< tabpane lang="shell" >}} {{< tab header="Docker" >}} docker exec -it cvat_server python manage.py exportcachecleanup @@ -74,6 +76,8 @@ To clean up the outdated cache, run the command depending on how CVAT is deploye {{< /tab >}} {{< /tabpane >}} + + ## How to upgrade CVAT from v2.2.0 to v2.3.0. 
Step by step commands how to upgrade CVAT from v2.2.0 to v2.3.0. From 93e9c5d37c1728c469aeb61846fe7b45830d0fb3 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Mon, 13 Jan 2025 21:35:20 +0100 Subject: [PATCH 58/61] Apply comments --- cvat/apps/dataset_manager/cron.py | 47 +++++++------------ ...cleanup.py => cleanuplegacyexportcache.py} | 7 ++- .../tests/test_rest_api_formats.py | 6 +-- cvat/apps/dataset_manager/util.py | 3 -- .../management/commands/runperiodicjob.py | 6 ++- cvat/apps/engine/tests/test_rest_api.py | 6 +-- cvat/settings/base.py | 8 ++-- dev/format_python_code.sh | 2 +- .../administration/advanced/upgrade_guide.md | 6 +-- 9 files changed, 37 insertions(+), 54 deletions(-) rename cvat/apps/dataset_manager/management/commands/{exportcachecleanup.py => cleanuplegacyexportcache.py} (87%) diff --git a/cvat/apps/dataset_manager/cron.py b/cvat/apps/dataset_manager/cron.py index 9619686a505b..e15123ed80da 100644 --- a/cvat/apps/dataset_manager/cron.py +++ b/cvat/apps/dataset_manager/cron.py @@ -92,7 +92,7 @@ def raise_if_exception(self) -> None: class CleanupTmpDirThread(BaseCleanupThread): - description: ClassVar[str] = "Cleanup common temporary directory" + description: ClassVar[str] = "common temporary directory cleanup" @suppress_exceptions def _cleanup(self) -> None: @@ -128,7 +128,7 @@ def _cleanup(self) -> None: class CleanupExportCacheThread(BaseCleanupThread): - description: ClassVar[str] = "Cleanup export cache" + description: ClassVar[str] = "export cache cleanup" @suppress_exceptions def _cleanup(self) -> None: @@ -156,43 +156,22 @@ def _cleanup(self) -> None: log_exception(logger) -class CleanupType(str, Enum): - EXPORT_CACHE = "export_cache" - TEMP_DIRECTORY = "temp_directory" - - def to_thread_class(self) -> Type[CleanupExportCacheThread | CleanupTmpDirThread]: - if CleanupType.EXPORT_CACHE == self.value: - ThreadClass = CleanupExportCacheThread - elif CleanupType.TEMP_DIRECTORY == self.value: - ThreadClass = CleanupTmpDirThread - else: - raise ValueError(f"Unknown cleaning type: {self.value}") - return ThreadClass - - -def cleanup(cleanup_type: CleanupType) -> None: +def cleanup(ThreadClass: Type[CleanupExportCacheThread | CleanupTmpDirThread]) -> None: + assert issubclass(ThreadClass, BaseCleanupThread) started_at = timezone.now() stop_event = Event() - ThreadClass = CleanupType(cleanup_type).to_thread_class() cleanup_thread = ThreadClass(stop_event=stop_event) if rq_job := get_current_job(): seconds_left = rq_job.timeout - 60 - sleep_interval = 10 - assert seconds_left > sleep_interval - finish_before = started_at + timedelta(seconds=seconds_left) - cleanup_thread.start() + assert seconds_left > 0 - while timezone.now() < finish_before: - if not cleanup_thread.is_alive(): - stop_event.set() - break - sleep(sleep_interval) + cleanup_thread.start() + cleanup_thread.join(timeout=seconds_left) - if not stop_event.is_set(): + if cleanup_thread.is_alive(): stop_event.set() - - cleanup_thread.join() + cleanup_thread.join() else: # run func in the current thread cleanup_thread.run() @@ -205,3 +184,11 @@ def cleanup(cleanup_type: CleanupType) -> None: f"completed after {int((finished_at - started_at).total_seconds())} seconds. 
" f"{cleanup_thread.number_of_removed_objects} elements have been removed" ) + + +def cleanup_tmp_directory() -> None: + cleanup(CleanupTmpDirThread) + + +def cleanup_export_cache_directory() -> None: + cleanup(CleanupExportCacheThread) diff --git a/cvat/apps/dataset_manager/management/commands/exportcachecleanup.py b/cvat/apps/dataset_manager/management/commands/cleanuplegacyexportcache.py similarity index 87% rename from cvat/apps/dataset_manager/management/commands/exportcachecleanup.py rename to cvat/apps/dataset_manager/management/commands/cleanuplegacyexportcache.py index be453db13508..6775b8db7c91 100644 --- a/cvat/apps/dataset_manager/management/commands/exportcachecleanup.py +++ b/cvat/apps/dataset_manager/management/commands/cleanuplegacyexportcache.py @@ -2,7 +2,6 @@ # # SPDX-License-Identifier: MIT -import os import shutil from contextlib import suppress from pathlib import Path @@ -24,7 +23,7 @@ def update_progress(): self.stdout.write(f"\rProgress: |{progress_bar}| {progress:.0%}", ending="") now = timezone.now() - progress_bar_len = os.get_terminal_size().columns // 2 + progress_bar_len = shutil.get_terminal_size().columns // 2 for Model in (Project, Task, Job): self.stdout.write(f"\nDeleting the export cache for {Model.__name__.lower()}s...") @@ -34,9 +33,9 @@ def update_progress(): continue msg = ( - f"The {objects_count} folders are going to be checked" + f"{objects_count} folders are going to be checked" if objects_count > 1 - else "The 1 folder is going to be checked" + else "1 folder is going to be checked" ) self.stdout.write(msg) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index 31197342f13f..c112cae63110 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -1449,7 +1449,7 @@ def test_concurrent_export_and_cleanup(self): export_checked_the_file = self.SharedBool() clear_has_been_finished = self.SharedBool() clear_removed_the_file = self.SharedBool() - export_outdated_after = timedelta(seconds=1) + export_outdated_after = timedelta(seconds=4) EXPORT_CACHE_LOCK_TTL = 4 EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT = EXPORT_CACHE_LOCK_TTL * 2 @@ -2062,7 +2062,7 @@ def test_cleanup_can_defer_removal_if_file_is_used_recently(self): self.assertTrue(osp.isfile(export_path)) def test_cleanup_cron_job_can_delete_cached_files(self): - from cvat.apps.dataset_manager.cron import cleanup + from cvat.apps.dataset_manager.cron import cleanup_export_cache_directory def _get_project_task_job_ids(): project = self._create_project(projects["main"]) @@ -2106,7 +2106,7 @@ def _get_project_task_job_ids(): ): mock_rq_job = MagicMock(timeout=100) mock_rq_get_current_job.return_value = mock_rq_job - cleanup('cvat.apps.dataset_manager.cron.CleanupExportCacheThread') + cleanup_export_cache_directory() mock_clear_export_cache.assert_called_once() self.assertFalse(osp.exists(export_path)) diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index ef2ef26f75d8..f2d2e05001d2 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -157,9 +157,6 @@ def get_export_cache_lock( class OperationType(str, Enum): EXPORT = "export" - @classmethod - def values(cls) -> list[str]: - return list(map(lambda x: x.value, cls)) class ExportFileType(str, Enum): ANNOTATIONS = "annotations" diff --git a/cvat/apps/engine/management/commands/runperiodicjob.py 
b/cvat/apps/engine/management/commands/runperiodicjob.py index 25ad62fd6e74..29745a63e8c3 100644 --- a/cvat/apps/engine/management/commands/runperiodicjob.py +++ b/cvat/apps/engine/management/commands/runperiodicjob.py @@ -10,7 +10,6 @@ class Command(BaseCommand): def add_arguments(self, parser: ArgumentParser) -> None: parser.add_argument("job_id", help="ID of the job to run") - parser.add_argument("job_args", nargs="*", help="Arguments to pass to the job") def handle(self, *args, **options): job_id = options["job_id"] @@ -18,7 +17,10 @@ def handle(self, *args, **options): for job_definition in settings.PERIODIC_RQ_JOBS: if job_definition["id"] == job_id: job_func = import_string(job_definition["func"]) - job_func(*options["job_args"]) + job_func( + *(job_definition.get("args", [])), + **(job_definition.get("kwargs", {})), + ) return raise CommandError(f"Job with ID {job_id} not found") diff --git a/cvat/apps/engine/tests/test_rest_api.py b/cvat/apps/engine/tests/test_rest_api.py index d45ce11a1194..ee30580cce0b 100644 --- a/cvat/apps/engine/tests/test_rest_api.py +++ b/cvat/apps/engine/tests/test_rest_api.py @@ -3108,7 +3108,7 @@ def test_api_v2_tasks_id_export_no_auth(self): self._run_api_v2_tasks_id_export_import(None) def test_can_remove_export_cache_automatically_after_successful_export(self): - from cvat.apps.dataset_manager.cron import cleanup, clear_export_cache + from cvat.apps.dataset_manager.cron import cleanup_export_cache_directory, clear_export_cache self._create_tasks() task_id = self.tasks[0]["id"] user = self.admin @@ -3127,7 +3127,7 @@ def test_can_remove_export_cache_automatically_after_successful_export(self): ): mock_rq_job = mock.MagicMock(timeout=100) mock_rq_get_current_job.return_value = mock_rq_job - cleanup("cvat.apps.dataset_manager.cron.CleanupExportCacheThread") + cleanup_export_cache_directory() mock_clear_export_cache.assert_not_called() response = self._run_api_v2_tasks_id_export(task_id, user) @@ -3148,7 +3148,7 @@ def test_can_remove_export_cache_automatically_after_successful_export(self): mock.patch('cvat.apps.dataset_manager.views.TASK_CACHE_TTL', new=timedelta(seconds=0)), mock.patch('cvat.apps.dataset_manager.views.TTL_CONSTS', new={'task': timedelta(seconds=0)}), ): - cleanup("cvat.apps.dataset_manager.cron.CleanupExportCacheThread") + cleanup_export_cache_directory() mock_clear_export_cache.assert_called_once() self.assertFalse(os.path.exists(file_path)) diff --git a/cvat/settings/base.py b/cvat/settings/base.py index 14e5db44c81d..ae2952c6849f 100644 --- a/cvat/settings/base.py +++ b/cvat/settings/base.py @@ -355,19 +355,17 @@ class CVAT_QUEUES(Enum): }, { 'queue': CVAT_QUEUES.CLEANING.value, - 'id': 'cron_export_cache_cleanup', - 'func': 'cvat.apps.dataset_manager.cron.cleanup', + 'id': 'cron_export_cache_directory_cleanup', + 'func': 'cvat.apps.dataset_manager.cron.cleanup_export_cache_directory', # Run twice a day (at midnight and at noon) 'cron_string': '0 0,12 * * *', - 'args': ('cvat.apps.dataset_manager.cron.CleanupExportCacheThread',), }, { 'queue': CVAT_QUEUES.CLEANING.value, 'id': 'cron_tmp_directory_cleanup', - 'func': 'cvat.apps.dataset_manager.cron.cleanup', + 'func': 'cvat.apps.dataset_manager.cron.cleanup_tmp_directory', # Run once a day 'cron_string': '0 18 * * *', - 'args': ('cvat.apps.dataset_manager.cron.CleanupTmpDirThread',), } ] diff --git a/dev/format_python_code.sh b/dev/format_python_code.sh index 6d4288ec1b4a..e18bb2b3e1eb 100755 --- a/dev/format_python_code.sh +++ b/dev/format_python_code.sh @@ -36,7 +36,7 @@ for paths 
in \ "cvat/apps/dataset_manager/tests/utils.py" \ "cvat/apps/events/signals.py" \ "cvat/apps/engine/management/commands/syncperiodicjobs.py" \ - "cvat/apps/dataset_manager/management/commands/exportcachecleanup.py" \ + "cvat/apps/dataset_manager/management/commands/cleanuplegacyexportcache.py" \ ; do ${BLACK} -- ${paths} ${ISORT} -- ${paths} diff --git a/site/content/en/docs/administration/advanced/upgrade_guide.md b/site/content/en/docs/administration/advanced/upgrade_guide.md index 5ca168dcf62d..27cdd8cf3654 100644 --- a/site/content/en/docs/administration/advanced/upgrade_guide.md +++ b/site/content/en/docs/administration/advanced/upgrade_guide.md @@ -65,14 +65,14 @@ To clean up the outdated cache, run the command depending on how CVAT is deploye {{< tabpane lang="shell" >}} {{< tab header="Docker" >}} - docker exec -it cvat_server python manage.py exportcachecleanup + docker exec -it cvat_server python manage.py cleanuplegacyexportcache {{< /tab >}} {{< tab header="Kubernetes" >}} cvat_backend_pod=$(kubectl get pods -l component=server -o 'jsonpath={.items[0].metadata.name}') - kubectl exec -it ${cvat_backend_pod} -- python manage.py exportcachecleanup + kubectl exec -it ${cvat_backend_pod} -- python manage.py cleanuplegacyexportcache {{< /tab >}} {{< tab header="Development" >}} - python manage.py exportcachecleanup + python manage.py cleanuplegacyexportcache {{< /tab >}} {{< /tabpane >}} From d127636bd4fe7d53d4591bb4aab8c2419d95a013 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Mon, 13 Jan 2025 22:28:13 +0100 Subject: [PATCH 59/61] Fix isort issues && remove unused imports --- cvat/apps/dataset_manager/cron.py | 2 -- cvat/apps/engine/backup.py | 2 +- cvat/apps/engine/tests/test_rest_api.py | 5 ++++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cvat/apps/dataset_manager/cron.py b/cvat/apps/dataset_manager/cron.py index e15123ed80da..1de0af90a8a1 100644 --- a/cvat/apps/dataset_manager/cron.py +++ b/cvat/apps/dataset_manager/cron.py @@ -8,11 +8,9 @@ import shutil from abc import ABCMeta, abstractmethod from datetime import timedelta -from enum import Enum from functools import wraps from pathlib import Path from threading import Event, Thread -from time import sleep from typing import Callable, ClassVar, Type from django.conf import settings diff --git a/cvat/apps/engine/backup.py b/cvat/apps/engine/backup.py index 41fbee2815d8..3d9e420c6a03 100644 --- a/cvat/apps/engine/backup.py +++ b/cvat/apps/engine/backup.py @@ -15,7 +15,7 @@ from enum import Enum from logging import Logger from tempfile import NamedTemporaryFile -from typing import Any, Optional, Type, Union, ClassVar +from typing import Any, ClassVar, Optional, Type, Union from zipfile import ZipFile import django_rq diff --git a/cvat/apps/engine/tests/test_rest_api.py b/cvat/apps/engine/tests/test_rest_api.py index ee30580cce0b..90240c5b8189 100644 --- a/cvat/apps/engine/tests/test_rest_api.py +++ b/cvat/apps/engine/tests/test_rest_api.py @@ -3108,7 +3108,10 @@ def test_api_v2_tasks_id_export_no_auth(self): self._run_api_v2_tasks_id_export_import(None) def test_can_remove_export_cache_automatically_after_successful_export(self): - from cvat.apps.dataset_manager.cron import cleanup_export_cache_directory, clear_export_cache + from cvat.apps.dataset_manager.cron import ( + cleanup_export_cache_directory, + clear_export_cache, + ) self._create_tasks() task_id = self.tasks[0]["id"] user = self.admin From 6baf90bd3ff9925b9dda0a15868f7e09dd8a999b Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Tue, 14 
Jan 2025 12:52:13 +0100 Subject: [PATCH 60/61] Simplify the code --- cvat/apps/dataset_manager/cron.py | 93 +++++-------------- .../tests/test_rest_api_formats.py | 5 - cvat/apps/engine/tests/test_rest_api.py | 5 - 3 files changed, 22 insertions(+), 81 deletions(-) diff --git a/cvat/apps/dataset_manager/cron.py b/cvat/apps/dataset_manager/cron.py index 1de0af90a8a1..278bbe945d51 100644 --- a/cvat/apps/dataset_manager/cron.py +++ b/cvat/apps/dataset_manager/cron.py @@ -8,14 +8,11 @@ import shutil from abc import ABCMeta, abstractmethod from datetime import timedelta -from functools import wraps from pathlib import Path -from threading import Event, Thread -from typing import Callable, ClassVar, Type +from typing import ClassVar, Type from django.conf import settings from django.utils import timezone -from rq import get_current_job from cvat.apps.dataset_manager.util import ( CacheFileOrDirPathParseError, @@ -34,17 +31,6 @@ logger = ServerLogManager(__name__).glob -def suppress_exceptions(func: Callable[[CleanupExportCacheThread], None]): - @wraps(func) - def wrapper(self: CleanupExportCacheThread): - try: - func(self) - except Exception as ex: - self.set_exception(ex) - - return wrapper - - def clear_export_cache(file_path: Path) -> bool: with get_export_cache_lock( file_path, @@ -64,36 +50,25 @@ def clear_export_cache(file_path: Path) -> bool: return True -class BaseCleanupThread(Thread, metaclass=ABCMeta): - description: ClassVar[str] +class BaseCleaner(metaclass=ABCMeta): + task_description: ClassVar[str] - def __init__(self, stop_event: Event, *args, **kwargs) -> None: - self._stop_event = stop_event + def __init__(self) -> None: self._number_of_removed_objects = 0 - self._exception = None - super().__init__(*args, **kwargs, target=self._cleanup) @property def number_of_removed_objects(self) -> int: return self._number_of_removed_objects @abstractmethod - def _cleanup(self) -> None: ... - - def set_exception(self, ex: Exception) -> None: - assert isinstance(ex, Exception) - self._exception = ex + def do_cleanup(self): + pass - def raise_if_exception(self) -> None: - if isinstance(self._exception, Exception): - raise self._exception +class TmpDirectoryCleaner(BaseCleaner): + task_description: ClassVar[str] = "common temporary directory cleanup" -class CleanupTmpDirThread(BaseCleanupThread): - description: ClassVar[str] = "common temporary directory cleanup" - - @suppress_exceptions - def _cleanup(self) -> None: + def do_cleanup(self) -> None: # we do not use locks here when handling objects from tmp directory # because undesired race conditions are not possible here: # 1. A temporary file/directory can be removed while checking access time. @@ -102,10 +77,6 @@ def _cleanup(self) -> None: # 3. Each temporary file/directory has a unique name, so the race condition when one process is creating an object # and another is removing it - impossible. 
for child in os.scandir(TmpDirManager.TMP_ROOT): - # stop clean up process correctly before rq job timeout is ended - if self._stop_event.is_set(): - return - try: if ( child.stat().st_atime @@ -125,19 +96,14 @@ def _cleanup(self) -> None: log_exception(logger) -class CleanupExportCacheThread(BaseCleanupThread): - description: ClassVar[str] = "export cache cleanup" +class ExportCacheDirectoryCleaner(BaseCleaner): + task_description: ClassVar[str] = "export cache directory cleanup" - @suppress_exceptions - def _cleanup(self) -> None: + def do_cleanup(self) -> None: export_cache_dir_path = settings.EXPORT_CACHE_ROOT assert os.path.exists(export_cache_dir_path) for child in os.scandir(export_cache_dir_path): - # stop clean up process correctly before rq job timeout is ended - if self._stop_event.is_set(): - return - # export cache directory is expected to contain only files if not child.is_file(): logger.warning(f"The {child.name} is not a file, skipping...") @@ -154,39 +120,24 @@ def _cleanup(self) -> None: log_exception(logger) -def cleanup(ThreadClass: Type[CleanupExportCacheThread | CleanupTmpDirThread]) -> None: - assert issubclass(ThreadClass, BaseCleanupThread) +def cleanup(CleanerClass: Type[ExportCacheDirectoryCleaner | TmpDirectoryCleaner]) -> None: + assert issubclass(CleanerClass, BaseCleaner) started_at = timezone.now() - stop_event = Event() - cleanup_thread = ThreadClass(stop_event=stop_event) - if rq_job := get_current_job(): - seconds_left = rq_job.timeout - 60 - assert seconds_left > 0 - - cleanup_thread.start() - cleanup_thread.join(timeout=seconds_left) - - if cleanup_thread.is_alive(): - stop_event.set() - cleanup_thread.join() - else: - # run func in the current thread - cleanup_thread.run() - - cleanup_thread.raise_if_exception() + cleaner = CleanerClass() + cleaner.do_cleanup() finished_at = timezone.now() logger.info( - f"The {cleanup_thread.description!r} process has been successfully " + f"The {cleaner.task_description!r} process has been successfully " f"completed after {int((finished_at - started_at).total_seconds())} seconds. 
" - f"{cleanup_thread.number_of_removed_objects} elements have been removed" + f"{cleaner.number_of_removed_objects} elements have been removed" ) -def cleanup_tmp_directory() -> None: - cleanup(CleanupTmpDirThread) +def cleanup_export_cache_directory() -> None: + cleanup(ExportCacheDirectoryCleaner) -def cleanup_export_cache_directory() -> None: - cleanup(CleanupExportCacheThread) +def cleanup_tmp_directory() -> None: + cleanup(TmpDirectoryCleaner) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index c112cae63110..0e0205bf69df 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -2100,12 +2100,7 @@ def _get_project_task_job_ids(): "cvat.apps.dataset_manager.cron.clear_export_cache", side_effect=clear_export_cache, ) as mock_clear_export_cache, - patch( - "cvat.apps.dataset_manager.cron.get_current_job", - ) as mock_rq_get_current_job, ): - mock_rq_job = MagicMock(timeout=100) - mock_rq_get_current_job.return_value = mock_rq_job cleanup_export_cache_directory() mock_clear_export_cache.assert_called_once() diff --git a/cvat/apps/engine/tests/test_rest_api.py b/cvat/apps/engine/tests/test_rest_api.py index 90240c5b8189..4cd7ad254854 100644 --- a/cvat/apps/engine/tests/test_rest_api.py +++ b/cvat/apps/engine/tests/test_rest_api.py @@ -3124,12 +3124,7 @@ def test_can_remove_export_cache_automatically_after_successful_export(self): "cvat.apps.dataset_manager.cron.clear_export_cache", side_effect=clear_export_cache, ) as mock_clear_export_cache, - mock.patch( - "cvat.apps.dataset_manager.cron.get_current_job", - ) as mock_rq_get_current_job, ): - mock_rq_job = mock.MagicMock(timeout=100) - mock_rq_get_current_job.return_value = mock_rq_job cleanup_export_cache_directory() mock_clear_export_cache.assert_not_called() From d725af9151bfb49180bcab9adeee31981f2c0a4e Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Tue, 14 Jan 2025 13:31:57 +0100 Subject: [PATCH 61/61] Revert documentation chnages --- .../administration/advanced/upgrade_guide.md | 22 ------------------- 1 file changed, 22 deletions(-) diff --git a/site/content/en/docs/administration/advanced/upgrade_guide.md b/site/content/en/docs/administration/advanced/upgrade_guide.md index 27cdd8cf3654..3462b20d28a5 100644 --- a/site/content/en/docs/administration/advanced/upgrade_guide.md +++ b/site/content/en/docs/administration/advanced/upgrade_guide.md @@ -56,28 +56,6 @@ To upgrade CVAT, follow these steps: docker logs cvat_server -f ``` -## Upgrade CVAT after v2.25.0 - -In version 2.25.0, CVAT changed the location where the export cache is stored. -To clean up the outdated cache, run the command depending on how CVAT is deployed: - - - -{{< tabpane lang="shell" >}} - {{< tab header="Docker" >}} - docker exec -it cvat_server python manage.py cleanuplegacyexportcache - {{< /tab >}} - {{< tab header="Kubernetes" >}} - cvat_backend_pod=$(kubectl get pods -l component=server -o 'jsonpath={.items[0].metadata.name}') - kubectl exec -it ${cvat_backend_pod} -- python manage.py cleanuplegacyexportcache - {{< /tab >}} - {{< tab header="Development" >}} - python manage.py cleanuplegacyexportcache - {{< /tab >}} -{{< /tabpane >}} - - - ## How to upgrade CVAT from v2.2.0 to v2.3.0. Step by step commands how to upgrade CVAT from v2.2.0 to v2.3.0.