From 7e57cbb65dd6b01805b4613eb5ca9e252d67ad85 Mon Sep 17 00:00:00 2001 From: Dmitrii Lavrukhin Date: Tue, 21 Jan 2025 13:25:42 +0300 Subject: [PATCH 1/9] backup for cloud-based tasks/projects --- cvat/apps/engine/backup.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/cvat/apps/engine/backup.py b/cvat/apps/engine/backup.py index 20cd81e0d706..0d5b846f3ec6 100644 --- a/cvat/apps/engine/backup.py +++ b/cvat/apps/engine/backup.py @@ -8,6 +8,7 @@ import os import re import shutil +import tempfile import uuid from abc import ABCMeta, abstractmethod from collections.abc import Collection, Iterable @@ -46,7 +47,7 @@ retry_current_rq_job, ) from cvat.apps.engine import models -from cvat.apps.engine.cloud_provider import import_resource_from_cloud_storage +from cvat.apps.engine.cloud_provider import db_storage_to_storage_instance, import_resource_from_cloud_storage from cvat.apps.engine.location import StorageType, get_location_configuration from cvat.apps.engine.log import ServerLogManager from cvat.apps.engine.models import ( @@ -439,8 +440,29 @@ def _write_data(self, zip_object, target_dir=None): files=[self._db_data.get_manifest_path()], target_dir=target_data_dir, ) + elif self._db_data.storage == StorageChoice.CLOUD_STORAGE: + assert not hasattr(self._db_data, 'video'), "Only images can be stored in cloud storage" + media_files = [im.path for im in self._db_data.images.all()] + cloud_storage_instance = db_storage_to_storage_instance(self._db_data.cloud_storage) + with tempfile.TemporaryDirectory() as tmp_dir: + cloud_storage_instance.bulk_download_to_dir(files=media_files, upload_dir=tmp_dir) + self._write_files( + source_dir=tmp_dir, + zip_object=zip_object, + files=[ + os.path.join(tmp_dir, file) + for file in media_files + ], + target_dir=target_data_dir, + ) + self._write_files( + source_dir=self._db_data.get_upload_dirname(), + zip_object=zip_object, + files=[self._db_data.get_manifest_path()], + target_dir=target_data_dir, + ) else: - raise NotImplementedError("We don't currently support backing up tasks with data from cloud storage") + raise NotImplementedError def _write_task(self, zip_object, target_dir=None): task_dir = self._db_task.get_dirname() @@ -539,6 +561,9 @@ def serialize_data(): ] data['validation_layout'] = validation_params + if self._db_data.storage == StorageChoice.CLOUD_STORAGE: + data["storage"] = StorageChoice.LOCAL + return self._prepare_data_meta(data) task = serialize_task() From 42ff71630bcdb5e68d27cfd667428bce822c0c4a Mon Sep 17 00:00:00 2001 From: Dmitrii Lavrukhin Date: Tue, 21 Jan 2025 15:20:40 +0300 Subject: [PATCH 2/9] test can export and import --- tests/python/rest_api/test_tasks.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/python/rest_api/test_tasks.py b/tests/python/rest_api/test_tasks.py index f1ad80f62639..9111c841e802 100644 --- a/tests/python/rest_api/test_tasks.py +++ b/tests/python/rest_api/test_tasks.py @@ -3959,6 +3959,33 @@ def test_cannot_export_backup_for_task_without_data(self, tasks): assert exc.status == HTTPStatus.BAD_REQUEST assert "Backup of a task without data is not allowed" == exc.body.encode() + def test_can_export_and_import_backup_task_with_cloud_storage(self, tasks): + cloud_storage_content = ["image_case_65_1.png", "image_case_65_2.png"] + task_spec = { + "name": "Task with files from cloud storage", + "labels": [ + { + "name": "car", + } + ], + } + data_spec = { + "image_quality": 75, + "use_cache": False, + "cloud_storage_id": 1, + "server_files": cloud_storage_content, + } + task_id, _ = create_task(self.user, task_spec, data_spec) + + task = self.client.tasks.retrieve(task_id) + + filename = self.tmp_dir / f"cloud_task_{task.id}_backup.zip" + task.download_backup(filename) + + assert filename.is_file() + assert filename.stat().st_size > 0 + self._test_can_restore_task_from_backup(task_id) + @pytest.mark.parametrize("mode", ["annotation", "interpolation"]) def test_can_import_backup(self, tasks, mode): task_id = next(t for t in tasks if t["mode"] == mode)["id"] From 1244a4189a4e50e10c16e1a02b1cbdb151c3fcfa Mon Sep 17 00:00:00 2001 From: Dmitrii Lavrukhin Date: Tue, 21 Jan 2025 16:02:29 +0300 Subject: [PATCH 3/9] fix isort --- cvat/apps/engine/backup.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cvat/apps/engine/backup.py b/cvat/apps/engine/backup.py index 0d5b846f3ec6..0b5d02a994d3 100644 --- a/cvat/apps/engine/backup.py +++ b/cvat/apps/engine/backup.py @@ -47,7 +47,10 @@ retry_current_rq_job, ) from cvat.apps.engine import models -from cvat.apps.engine.cloud_provider import db_storage_to_storage_instance, import_resource_from_cloud_storage +from cvat.apps.engine.cloud_provider import ( + db_storage_to_storage_instance, + import_resource_from_cloud_storage, +) from cvat.apps.engine.location import StorageType, get_location_configuration from cvat.apps.engine.log import ServerLogManager from cvat.apps.engine.models import ( From 6855860884c5233e67cba99f3657b7b7eb2de25f Mon Sep 17 00:00:00 2001 From: Dmitrii Lavrukhin Date: Tue, 4 Feb 2025 10:52:43 +0400 Subject: [PATCH 4/9] backup tests for media in cloud --- cvat/apps/engine/tests/test_rest_api.py | 141 +++++++++++++++++++++++- 1 file changed, 140 insertions(+), 1 deletion(-) diff --git a/cvat/apps/engine/tests/test_rest_api.py b/cvat/apps/engine/tests/test_rest_api.py index 72b291bd71f0..7fa6929f45ff 100644 --- a/cvat/apps/engine/tests/test_rest_api.py +++ b/cvat/apps/engine/tests/test_rest_api.py @@ -31,6 +31,7 @@ from django.conf import settings from django.contrib.auth.models import Group, User from django.http import HttpResponse +from django.test import override_settings from pdf2image import convert_from_bytes from PIL import Image from pycocotools import coco as coco_loader @@ -42,6 +43,7 @@ from cvat.apps.dataset_manager.tests.utils import TestDir from cvat.apps.dataset_manager.util import current_function_name +from cvat.apps.engine.cloud_provider import AWS_S3, Status from cvat.apps.engine.media_extractors import ValidateDimension, sort from cvat.apps.engine.models import ( AttributeSpec, @@ -1317,6 +1319,7 @@ def test_api_v2_projects_id_tasks_no_auth(self): response = self._run_api_v2_projects_id_tasks(None, project.id) self.assertEqual(response.status_code, status.HTTP_401_UNAUTHORIZED) + class ProjectBackupAPITestCase(ApiTestBase): @classmethod def setUpTestData(cls): @@ -1633,6 +1636,12 @@ def _run_api_v2_projects_id(self, pid, user): return response.data + def _get_tasks_for_project(self, user, pid): + with ForceLogin(user, self.client): + response = self.client.get('/api/tasks?project_id={}'.format(pid)) + + return sorted(response.data["results"], key=lambda task: task["name"]) + def _run_api_v2_projects_id_export_import(self, user): for project in self.projects: if user: @@ -1669,7 +1678,7 @@ def _run_api_v2_projects_id_export_import(self, user): } response = self._run_api_v2_projects_import(user, uploaded_data) self.assertEqual(response.status_code, HTTP_202_ACCEPTED) - if response.status_code == status.HTTP_200_OK: + if response.status_code == status.HTTP_202_ACCEPTED: rq_id = response.data["rq_id"] response = self._run_api_v2_projects_import(user, {"rq_id": rq_id}) self.assertEqual(response.status_code, HTTP_201_CREATED) @@ -1691,6 +1700,26 @@ def _run_api_v2_projects_id_export_import(self, user): "tasks", ), ) + self.assertEqual(original_project["tasks"]["count"], imported_project["tasks"]["count"]) + original_tasks = self._get_tasks_for_project(user, original_project["id"]) + imported_tasks = self._get_tasks_for_project(user, imported_project["id"]) + for original_task, imported_task in zip(original_tasks, imported_tasks): + compare_objects( + self=self, + obj1=original_task, + obj2=imported_task, + ignore_keys=( + "id", + "url", + "created_date", + "updated_date", + "username", + "project_id", + "data", + # backup does not have this info for some reason + "overlap", + ), + ) def test_api_v2_projects_id_export_admin(self): self._run_api_v2_projects_id_export_import(self.admin) @@ -1704,6 +1733,116 @@ def test_api_v2_projects_id_export_somebody(self): def test_api_v2_projects_id_export_no_auth(self): self._run_api_v2_projects_id_export_import(None) + +@override_settings(MEDIA_CACHE_ALLOW_STATIC_CACHE=False) +class ProjectCloudBackupAPINoStaticChunksTestCase(ProjectBackupAPITestCase): + @classmethod + def setUpTestData(cls): + create_db_users(cls) + cls.client = APIClient() + cls._create_cloud_storage() + cls._create_media() + cls._create_projects() + + @classmethod + def _create_cloud_storage(cls): + data = { + "provider_type": "AWS_S3_BUCKET", + "resource": "test", + "display_name": "Bucket", + "credentials_type": "KEY_SECRET_KEY_PAIR", + "key": "minio_access_key", + "secret_key": "minio_secret_key", + "specific_attributes": "endpoint_url=http://minio:9000", + "description": "Some description", + "manifests": [], + } + + class MockAWS(AWS_S3): + _files = {} + + def get_status(self): + return Status.AVAILABLE + + @classmethod + def create_file(cls, key, bytes): + cls._files[key] = bytes + + def get_file_status(self, key): + return Status.AVAILABLE if key in self._files else Status.NOT_FOUND + + def _download_range_of_bytes(self, key, stop_byte, start_byte): + return self._files[key][start_byte:stop_byte] + + def _download_fileobj_to_stream(self, key, stream): + stream.write(self._files[key]) + + cls.mock_aws = MockAWS + + cls.aws_patch = mock.patch("cvat.apps.engine.cloud_provider.AWS_S3", MockAWS) + cls.aws_patch.start() + + with ForceLogin(cls.owner, cls.client): + response = cls.client.post('/api/cloudstorages', data=data, format="json") + assert response.status_code == status.HTTP_201_CREATED, (response.status_code, response.content) + cls.cloud_storage_id = response.json()["id"] + + @classmethod + def tearDownClass(cls): + cls.aws_patch.stop() + super().tearDownClass() + + @classmethod + def _create_media(cls): + cls.media_data = [] + cls.media = {'files': [], 'dirs': []} + for file in [ + generate_random_image_file("test_1.jpg")[1], + generate_random_image_file("test_2.jpg")[1], + generate_pdf_file("test_pdf_1.pdf", 7)[1], + generate_zip_archive_file("test_archive_1.zip", 10)[1], + generate_video_file("test_video.mp4")[1], + ]: + cls.mock_aws.create_file(file.name, file.getvalue()) + + cls.media_data.extend([ + # image list cloud + { + "server_files[0]": "test_1.jpg", + "server_files[1]": "test_2.jpg", + "image_quality": 75, + "cloud_storage_id": cls.cloud_storage_id, + "storage": StorageChoice.CLOUD_STORAGE, + }, + # video cloud + { + "server_files[0]": "test_video.mp4", + "image_quality": 75, + "cloud_storage_id": cls.cloud_storage_id, + "storage": StorageChoice.CLOUD_STORAGE, + }, + # zip archive cloud + { + "server_files[0]": "test_archive_1.zip", + "image_quality": 50, + "cloud_storage_id": cls.cloud_storage_id, + "storage": StorageChoice.CLOUD_STORAGE, + }, + # pdf cloud + { + "server_files[0]": "test_pdf_1.pdf", + "image_quality": 54, + "cloud_storage_id": cls.cloud_storage_id, + "storage": StorageChoice.CLOUD_STORAGE, + }, + ]) + + +@override_settings(MEDIA_CACHE_ALLOW_STATIC_CACHE=True) +class ProjectCloudBackupAPIStaticChunksTestCase(ProjectCloudBackupAPINoStaticChunksTestCase): + pass + + class ProjectExportAPITestCase(ApiTestBase): @classmethod def setUpTestData(cls): From 9014a2d4fa20f4711b6f18483759373415f45511 Mon Sep 17 00:00:00 2001 From: Dmitrii Lavrukhin Date: Tue, 4 Feb 2025 11:10:52 +0400 Subject: [PATCH 5/9] fixing pylint --- cvat/apps/engine/tests/test_rest_api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cvat/apps/engine/tests/test_rest_api.py b/cvat/apps/engine/tests/test_rest_api.py index 7fa6929f45ff..73c2f0c0322d 100644 --- a/cvat/apps/engine/tests/test_rest_api.py +++ b/cvat/apps/engine/tests/test_rest_api.py @@ -1765,8 +1765,8 @@ def get_status(self): return Status.AVAILABLE @classmethod - def create_file(cls, key, bytes): - cls._files[key] = bytes + def create_file(cls, key, _bytes): + cls._files[key] = _bytes def get_file_status(self, key): return Status.AVAILABLE if key in self._files else Status.NOT_FOUND From 82c92f2c8a1eac3f4ff7f95f1077e39ea7c8ae9b Mon Sep 17 00:00:00 2001 From: Dmitrii Lavrukhin Date: Tue, 4 Feb 2025 15:15:15 +0400 Subject: [PATCH 6/9] fixing bug --- cvat/apps/engine/media_extractors.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cvat/apps/engine/media_extractors.py b/cvat/apps/engine/media_extractors.py index 9bee5fbee4cb..77bc6dd8464b 100644 --- a/cvat/apps/engine/media_extractors.py +++ b/cvat/apps/engine/media_extractors.py @@ -184,14 +184,17 @@ def __getitem__(self, idx: int): value = super().__getitem__(idx) value_size = self._get_object_size(value) - while ( - len(self._cache) + 1 > self.max_cache_entries or - self.used_cache_memory + value_size > self.max_cache_memory - ): + def can_put_item_in_cache(): + return ( + len(self._cache) + 1 <= self.max_cache_entries and + self.used_cache_memory + value_size <= self.max_cache_memory + ) + + while len(self._cache) > 0 and not can_put_item_in_cache(): min_key = min(self._cache.keys()) self._cache.pop(min_key) - if self.used_cache_memory + value_size <= self.max_cache_memory: + if can_put_item_in_cache(): self._cache[idx] = self._CacheItem(value, value_size) return value From eda7e442d5d6dce30966e8652877ca91234517b5 Mon Sep 17 00:00:00 2001 From: Dmitrii Lavrukhin Date: Tue, 4 Feb 2025 15:21:37 +0400 Subject: [PATCH 7/9] adding asserts for cloud backup --- cvat/apps/engine/backup.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cvat/apps/engine/backup.py b/cvat/apps/engine/backup.py index 1b1e1298f299..49544d90aa77 100644 --- a/cvat/apps/engine/backup.py +++ b/cvat/apps/engine/backup.py @@ -399,14 +399,14 @@ class TaskExporter(_ExporterBase, _TaskBackupBase): def __init__(self, pk, version=Version.V1): super().__init__(logger=slogger.task[pk]) - self._db_task = ( + self._db_task: models.Task = ( models.Task.objects .prefetch_related('data__images', 'annotation_guide__assets') .select_related('data__video', 'data__validation_layout', 'annotation_guide') .get(pk=pk) ) - self._db_data = self._db_task.data + self._db_data: models.Data = self._db_task.data self._version = version db_labels = (self._db_task.project if self._db_task.project_id else self._db_task).label_set.all().prefetch_related( @@ -448,7 +448,9 @@ def _write_data(self, zip_object, target_dir=None): target_dir=target_data_dir, ) elif self._db_data.storage == StorageChoice.CLOUD_STORAGE: + assert self._db_task.dimension != models.DimensionType.DIM_3D, "Cloud storage cannot contain 3d images" assert not hasattr(self._db_data, 'video'), "Only images can be stored in cloud storage" + assert self._db_data.related_files.count() == 0, "No related images can be stored in cloud storage" media_files = [im.path for im in self._db_data.images.all()] cloud_storage_instance = db_storage_to_storage_instance(self._db_data.cloud_storage) with tempfile.TemporaryDirectory() as tmp_dir: From ee53e6444f207ec8fe784787d95f4abbbab8f78c Mon Sep 17 00:00:00 2001 From: Dmitrii Lavrukhin Date: Tue, 4 Feb 2025 15:39:23 +0400 Subject: [PATCH 8/9] changelog entry --- ...0250204_153709_dmitrii.lavrukhin_backup_task_from_cloud.md | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 changelog.d/20250204_153709_dmitrii.lavrukhin_backup_task_from_cloud.md diff --git a/changelog.d/20250204_153709_dmitrii.lavrukhin_backup_task_from_cloud.md b/changelog.d/20250204_153709_dmitrii.lavrukhin_backup_task_from_cloud.md new file mode 100644 index 000000000000..92a35816b5fc --- /dev/null +++ b/changelog.d/20250204_153709_dmitrii.lavrukhin_backup_task_from_cloud.md @@ -0,0 +1,4 @@ +### Added + +- Tasks created from cloud storage can be backed up now + () From 46534f150f939e8ede1b330da4f9bea4816de65e Mon Sep 17 00:00:00 2001 From: Maxim Zhiltsov Date: Tue, 4 Feb 2025 17:05:28 +0300 Subject: [PATCH 9/9] Update tests/python/rest_api/test_tasks.py --- tests/python/rest_api/test_tasks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python/rest_api/test_tasks.py b/tests/python/rest_api/test_tasks.py index a1559edee8fc..5bf870efba1d 100644 --- a/tests/python/rest_api/test_tasks.py +++ b/tests/python/rest_api/test_tasks.py @@ -4095,6 +4095,7 @@ def test_cannot_export_backup_for_task_without_data(self, tasks): assert exc.status == HTTPStatus.BAD_REQUEST assert "Backup of a task without data is not allowed" == exc.body.encode() + @pytest.mark.with_external_services def test_can_export_and_import_backup_task_with_cloud_storage(self, tasks): cloud_storage_content = ["image_case_65_1.png", "image_case_65_2.png"] task_spec = {