Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ms/task creation performance fixes #8741

Open
wants to merge 4 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions changelog.d/20241126_094511_msarniak_fix_creation_time.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
### Fixed

- Slow task creation for jobs with more than 50k frames
(<https://github.com/cvat-ai/cvat/pull/8741>)
3 changes: 2 additions & 1 deletion cvat/apps/engine/media_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ def __init__(
dimension: DimensionType = DimensionType.DIM_2D
):
self._source_path = source_path
self._source_path_set = set(source_path) if source_path else set()

self._step = step

Expand Down Expand Up @@ -307,7 +308,7 @@ def __iter__(self):
yield (self.get_image(i), self.get_path(i), i)

def __contains__(self, media_file):
return media_file in self._source_path
return media_file in self._source_path_set

def filter(self, callback):
source_path = list(filter(callback, self._source_path))
Expand Down
11 changes: 9 additions & 2 deletions cvat/apps/engine/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,11 @@ def _count_files(data):
if not os.path.dirname(v[0]).startswith(v[1])]

# we need to keep the original sequence of files
data['server_files'] = [f for f in server_files if f in without_extra_dirs]
# Convert without_extra_dirs to a set for O(1) lookups
without_extra_dirs_set = set(without_extra_dirs) if without_extra_dirs else set()

# Filter server_files based on whether they exist in the set
data['server_files'] = [f for f in server_files if f in without_extra_dirs_set]

def count_files(file_mapping, counter):
for rel_path, full_path in file_mapping.items():
Expand Down Expand Up @@ -705,9 +709,12 @@ def _update_status(msg: str) -> None:

# We only need to process the files specified in job_file_mapping
if job_file_mapping is not None:
# Convert data['server_files'] to a set for O(1) membership checks
server_files_set = set(data['server_files']) if data['server_files'] else set()

filtered_files = []
for f in itertools.chain.from_iterable(job_file_mapping):
if f not in data['server_files']:
if f not in server_files_set:
raise ValidationError(f"Job mapping file {f} is not specified in input files")
filtered_files.append(f)
data['server_files'] = filtered_files
Expand Down
8 changes: 6 additions & 2 deletions utils/dataset_manifest/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -641,10 +641,14 @@ def data(self):
def get_subset(self, subset_names):
index_list = []
subset = []
# First, create a dictionary mapping image names to their indices
name_to_index = {name: index for index, name in enumerate(subset_names)} if subset_names else {}

# Now, loop through the images and check against the dictionary
for _, image in self:
image_name = f"{image.full_name}"
if image_name in subset_names:
index_list.append(subset_names.index(image_name))
if image_name in name_to_index:
index_list.append(name_to_index[image_name])
properties = {
'name': f"{image['name']}",
'extension': f"{image['extension']}",
Expand Down
Loading