From bd5063025fda28c070cd86f04295d41838c9002d Mon Sep 17 00:00:00 2001
From: Thibaud Dauce
Date: Tue, 14 Jan 2025 09:33:52 +0100
Subject: [PATCH 1/8] Add report of duplicate resource IDs

---
 udata/commands/db.py | 122 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 122 insertions(+)

diff --git a/udata/commands/db.py b/udata/commands/db.py
index e894f4f78a..98f312d338 100644
--- a/udata/commands/db.py
+++ b/udata/commands/db.py
@@ -12,6 +12,7 @@
 from udata import models as core_models
 from udata.api import oauth2 as oauth2_models
 from udata.commands import cli, cyan, echo, green, magenta, red, white, yellow
+from udata.core.dataset.models import Dataset
 from udata.harvest import models as harvest_models
 from udata.mongo import db
 
@@ -394,3 +395,124 @@ def print_and_save(text: str):
 def check_integrity(models):
     """Check the integrity of the database from a business perspective"""
     check_references(models)
+
+
+@grp.command()
+@click.option(
+    "-did",
+    "--duplicate-inside-dataset",
+    is_flag=True,
+    help="Show duplicates inside the same dataset (same resource ID inside one dataset)",
+)
+@click.option(
+    "-dod",
+    "--duplicate-outside-dataset",
+    is_flag=True,
+    help="Show duplicates outside (same resource ID shared between datasets)",
+)
+@click.option(
+    "-emf",
+    "--exclude-meteo-france",
+    is_flag=True,
+    help="Exclude Météo France datasets",
+)
+@click.option(
+    "-omf",
+    "--only-meteo-france",
+    is_flag=True,
+    help="Only Météo France datasets",
+)
+def check_duplicate_resources_ids(
+    duplicate_inside_dataset, duplicate_outside_dataset, exclude_meteo_france, only_meteo_france
+):
+    resources = {}
+
+    with click.progressbar(Dataset.objects, Dataset.objects().count()) as datasets:
+        for dataset in datasets:
+            for resource in dataset.resources:
+                if resource.id not in resources:
+                    resources[resource.id] = {"resources": [], "datasets": set()}
+                resources[resource.id]["resources"].append(resource)
+                resources[resource.id]["datasets"].add(dataset)
+
+    resources = {id: info for id, info in resources.items() if len(info["resources"]) != 1}
+
+    if duplicate_inside_dataset:
+        count_resources = 0
+        count_datasets = 0
+        for id, info in resources.items():
+            if len(info["datasets"]) == 1:
+                continue
+
+            # Filter out meteo france
+            if (
+                exclude_meteo_france
+                and list(info["datasets"])[0].organization
+                and str(list(info["datasets"])[0].organization.id) == "534fff8ba3a7292c64a77ed4"
+            ):
+                continue
+
+            # Filter everything except meteo france
+            if only_meteo_france and (
+                not list(info["datasets"])[0].organization
+                or str(list(info["datasets"])[0].organization.id) != "534fff8ba3a7292c64a77ed4"
+            ):
+                continue
+
+            count = len(info["resources"])
+            print(f"With ID {id}: {count} resources")
+            for dataset in info["datasets"]:
+                count_datasets += 1
+                print(f"\t- Dataset#{dataset.id} {dataset.title}")
+            print("")
+            for resource in info["resources"]:
+                count_resources += 1
+                print(f"\t- Resource {resource.title}")
+            print()
+            print("---")
+            print("---")
+            print("---")
+            print()
+
+        print(f"Resources with duplicated IDs: {count_resources}")
+        print(f"Datasets concerned {count_datasets}")
+
+    if duplicate_outside_dataset:
+        count_resources = 0
+        count_datasets = 0
+        for id, info in resources.items():
+            if len(info["datasets"]) > 1:
+                continue
+
+            # Filter out meteo france
+            if (
+                exclude_meteo_france
+                and list(info["datasets"])[0].organization
+                and str(list(info["datasets"])[0].organization.id) == "534fff8ba3a7292c64a77ed4"
+            ):
+                continue
+
+            # Filter everything except meteo france
+            if 
only_meteo_france and (
+                not list(info["datasets"])[0].organization
+                or str(list(info["datasets"])[0].organization.id) != "534fff8ba3a7292c64a77ed4"
+            ):
+                continue
+
+            count = len(info["resources"])
+            print(f"With ID {id}: {count} resources")
+            for dataset in info["datasets"]:
+                count_datasets += 1
+                print(f"\t- Dataset#{dataset.id} {dataset.title}")
+            print("")
+            for resource in info["resources"]:
+                count_resources += 1
+                print(f"\t- Resource {resource.title}")
+            print()
+            print("---")
+            print("---")
+            print("---")
+            print()
+
+        print(f"Resources with duplicated IDs: {count_resources}")
+        print(f"Datasets concerned {count_datasets}")

From c42a8a2a2f8cb31642d728ce63b835d2f3d406f1 Mon Sep 17 00:00:00 2001
From: Thibaud Dauce
Date: Tue, 14 Jan 2025 09:40:22 +0100
Subject: [PATCH 2/8] Fix inversion and add changelog

---
 CHANGELOG.md         | 2 +-
 udata/commands/db.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5b8d5b96fe..abcf51c74c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,7 +2,7 @@
 
 ## Current (in progress)
 
-- Nothing yet
+- Add report of duplicate resource IDs [#3247](https://github.com/opendatateam/udata/pull/3247)
 
 ## 10.0.7 (2025-01-13)
 
diff --git a/udata/commands/db.py b/udata/commands/db.py
index 98f312d338..74806cfddf 100644
--- a/udata/commands/db.py
+++ b/udata/commands/db.py
@@ -437,7 +437,7 @@ def check_duplicate_resources_ids(
 
     resources = {id: info for id, info in resources.items() if len(info["resources"]) != 1}
 
-    if duplicate_inside_dataset:
+    if duplicate_outside_dataset:
         count_resources = 0
         count_datasets = 0
         for id, info in resources.items():
@@ -477,7 +477,7 @@ def check_duplicate_resources_ids(
         print(f"Resources with duplicated IDs: {count_resources}")
         print(f"Datasets concerned {count_datasets}")
 
-    if duplicate_outside_dataset:
+    if duplicate_inside_dataset:
         count_resources = 0
         count_datasets = 0
         for id, info in resources.items():

From 23b9edb180588dfa9ca8bc7a54cf7aa71e33c395 Mon Sep 17 00:00:00 2001
From: Thibaud Dauce
Date: Tue, 14 Jan 2025 16:28:11 +0100
Subject: [PATCH 3/8] Add checksum info

---
 udata/commands/db.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/udata/commands/db.py b/udata/commands/db.py
index 74806cfddf..aded9eb084 100644
--- a/udata/commands/db.py
+++ b/udata/commands/db.py
@@ -12,7 +12,7 @@
 from udata import models as core_models
 from udata.api import oauth2 as oauth2_models
 from udata.commands import cli, cyan, echo, green, magenta, red, white, yellow
-from udata.core.dataset.models import Dataset
+from udata.core.dataset.models import Dataset, Resource
 from udata.harvest import models as harvest_models
 from udata.mongo import db
 
@@ -427,6 +427,15 @@ def check_duplicate_resources_ids(
 ):
     resources = {}
 
+    def get_additional_info(resource: Resource):
+        if resource.checksum:
+            return f" ({resource.checksum.type} {resource.checksum.value} / {resource.url})"
+
+        if "analysis:checksum" in resource.extras:
+            return f" ({resource.extras['analysis:checksum']} / {resource.url})"
+
+        return f" ({resource.url})"
+
     with click.progressbar(Dataset.objects, Dataset.objects().count()) as datasets:
         for dataset in datasets:
             for resource in dataset.resources:
@@ -467,7 +476,7 @@ def check_duplicate_resources_ids(
             print("")
             for resource in info["resources"]:
                 count_resources += 1
-                print(f"\t- Resource {resource.title}")
+                print(f"\t- Resource {resource.title}{get_additional_info(resource)}")
             print()
             print("---")
             print("---")
@@ -507,7 +516,7 @@ def 
check_duplicate_resources_ids( print("") for resource in info["resources"]: count_resources += 1 - print(f"\t- Resource {resource.title}") + print(f"\t- Resource {resource.title}{get_additional_info(resource)}") print() print("---") print("---") From d0601342598d8183263b5fcbbf4fb7fff8969d8f Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Wed, 22 Jan 2025 16:14:20 +0100 Subject: [PATCH 4/8] Add --fix to command --- udata/commands/db.py | 213 ++++++++++++++++++++++++++----------------- 1 file changed, 127 insertions(+), 86 deletions(-) diff --git a/udata/commands/db.py b/udata/commands/db.py index aded9eb084..3cf0278843 100644 --- a/udata/commands/db.py +++ b/udata/commands/db.py @@ -1,8 +1,10 @@ import collections +import copy import logging import os import traceback from itertools import groupby +from uuid import uuid4 import click import mongoengine @@ -292,7 +294,7 @@ def print_and_save(text: str): print("Those references will be inspected:") for reference in references: - print(f'- {reference["repr"]}({reference["destination"]}) — {reference["type"]}') + print(f"- {reference['repr']}({reference['destination']}) — {reference['type']}") print("") total = 0 @@ -306,7 +308,7 @@ def print_and_save(text: str): with click.progressbar(qs, length=count) as models: for obj in models: for reference in model_references: - key = f'\t- {reference["repr"]}({reference["destination"]}) — {reference["type"]}…' + key = f"\t- {reference['repr']}({reference['destination']}) — {reference['type']}…" if key not in errors[model]: errors[model][key] = 0 @@ -317,7 +319,7 @@ def print_and_save(text: str): except mongoengine.errors.DoesNotExist: errors[model][key] += 1 print_and_save( - f'\t{model.__name__}#{obj.id} have a broken reference for `{reference["name"]}`' + f"\t{model.__name__}#{obj.id} have a broken reference for `{reference['name']}`" ) elif reference["type"] == "list": attr_list = getattr(obj, reference["name"], []) @@ -327,7 +329,7 @@ def print_and_save(text: str): if isinstance(sub, DBRef): errors[model][key] += 1 print_and_save( - f'\t{model.__name__}#{obj.id} have a broken reference for {reference["name"]}[{i}]' + f"\t{model.__name__}#{obj.id} have a broken reference for {reference['name']}[{i}]" ) elif reference["type"] == "embed_list": p1, p2 = reference["name"].split("__") @@ -367,7 +369,7 @@ def print_and_save(text: str): f"\t{model.__name__}#{obj.id} have a broken reference for {p1}.{p2}[{i}]" ) else: - print_and_save(f'Unknown ref type {reference["type"]}') + print_and_save(f"Unknown ref type {reference['type']}") except mongoengine.errors.FieldDoesNotExist: print_and_save( f"[ERROR for {model.__name__} {obj.id}] {traceback.format_exc()}" @@ -422,106 +424,145 @@ def check_integrity(models): is_flag=True, help="Only Météo France datasets", ) +@click.option( + "-f", + "--fix", + is_flag=True, + help="Auto-fix some problems", +) def check_duplicate_resources_ids( - duplicate_inside_dataset, duplicate_outside_dataset, exclude_meteo_france, only_meteo_france + duplicate_inside_dataset, + duplicate_outside_dataset, + exclude_meteo_france, + only_meteo_france, + fix, ): resources = {} - def get_additional_info(resource: Resource): + def get_checksum_value(resource: Resource): if resource.checksum: - return f" ({resource.checksum.type} {resource.checksum.value} / {resource.url})" + return resource.checksum.value if "analysis:checksum" in resource.extras: - return f" ({resource.extras['analysis:checksum']} / {resource.url})" + return resource.extras["analysis:checksum"] - return f" 
({resource.url})" + return None - with click.progressbar(Dataset.objects, Dataset.objects().count()) as datasets: + with click.progressbar( + Dataset.objects, + Dataset.objects().count(), + ) as datasets: for dataset in datasets: for resource in dataset.resources: if resource.id not in resources: resources[resource.id] = {"resources": [], "datasets": set()} + resources[resource.id]["resources"].append(resource) resources[resource.id]["datasets"].add(dataset) resources = {id: info for id, info in resources.items() if len(info["resources"]) != 1} - if duplicate_outside_dataset: - count_resources = 0 - count_datasets = 0 - for id, info in resources.items(): - if len(info["datasets"]) == 1: - continue + count_resources = 0 + count_datasets = 0 + for id, info in resources.items(): + if len(info["datasets"]) == 1 and not duplicate_inside_dataset: + continue - # Filter out meteo france - if ( - exclude_meteo_france - and list(info["datasets"])[0].organization - and str(list(info["datasets"])[0].organization.id) == "534fff8ba3a7292c64a77ed4" - ): - continue + if len(info["datasets"]) > 1 and not duplicate_outside_dataset: + continue - # Filter everything except meteo france - if only_meteo_france and ( - not list(info["datasets"])[0].organization - or str(list(info["datasets"])[0].organization.id) != "534fff8ba3a7292c64a77ed4" - ): - continue - - count = len(info["resources"]) - print(f"With ID {id}: {count} resources") - for dataset in info["datasets"]: - count_datasets += 1 - print(f"\t- Dataset#{dataset.id} {dataset.title}") - print("") - for resource in info["resources"]: - count_resources += 1 - print(f"\t- Resource {resource.title}{get_additional_info(resource)}") - print() - print("---") - print("---") - print("---") - print() - - print(f"Resources with duplicated IDs: {count_resources}") - print(f"Datasets concerned {count_datasets}") - - if duplicate_inside_dataset: - count_resources = 0 - count_datasets = 0 - for id, info in resources.items(): - if len(info["datasets"]) > 1: - continue - - # Filter out meteo france - if ( - exclude_meteo_france - and list(info["datasets"])[0].organization - and str(list(info["datasets"])[0].organization.id) == "534fff8ba3a7292c64a77ed4" - ): - continue + # Filter out meteo france + if ( + exclude_meteo_france + and list(info["datasets"])[0].organization + and str(list(info["datasets"])[0].organization.id) == "534fff8ba3a7292c64a77ed4" + ): + continue + + # Filter everything except meteo france + if only_meteo_france and ( + not list(info["datasets"])[0].organization + or str(list(info["datasets"])[0].organization.id) != "534fff8ba3a7292c64a77ed4" + ): + continue + + count = len(info["resources"]) + print(f"With ID {id}: {count} resources") + for dataset in info["datasets"]: + count_datasets += 1 + print(f"\t- Dataset#{dataset.id} {dataset.title}") + print("") + for resource in info["resources"]: + count_resources += 1 + print( + f"\t- Resource {resource.title} ({get_checksum_value(resource)} / {resource.url})" + ) - # Filter everything except meteo france - if only_meteo_france and ( - not list(info["datasets"])[0].organization - or str(list(info["datasets"])[0].organization.id) != "534fff8ba3a7292c64a77ed4" + print() + + if len(info["datasets"]) == 1 and len(info["resources"]) == 2: + dataset = next(iter(info["datasets"])) + + resource1 = info["resources"][0] + resource2 = info["resources"][1] + + new_resources = [] + highlight_ids = [id] + if ( + get_checksum_value(resource1) == get_checksum_value(resource2) + and resource1.url == resource2.url ): - 
continue - - count = len(info["resources"]) - print(f"With ID {id}: {count} resources") - for dataset in info["datasets"]: - count_datasets += 1 - print(f"\t- Dataset#{dataset.id} {dataset.title}") - print("") - for resource in info["resources"]: - count_resources += 1 - print(f"\t- Resource {resource.title}{get_additional_info(resource)}") - print() - print("---") - print("---") - print("---") - print() - - print(f"Resources with duplicated IDs: {count_resources}") - print(f"Datasets concerned {count_datasets}") + print( + "Since checksum and URL are the same, fixing by removing the second resource…\n" + ) + + for r in dataset.resources: + # If it's the duplicated resource we're interested in and + # that ID was already added to the new_resources (so we are + # on the second resource), do not add it. + if r.id == id and id in [r.id for r in new_resources]: + continue + + new_resources.append(r) + else: + print( + "Since checksum and URL are not the same, fixing by setting a new ID on second resource…\n" + ) + + for r in dataset.resources: + # If it's the duplicated resource we're interested in and + # that ID was already added to the new_resources (so we are + # on the second resource), generate a new UUID + if r.id == id and id in [r.id for r in new_resources]: + # Just for logging we copy the resource to avoid changing the ID + # on the original resource (and have a clear compare at the end) + new_r = copy.deepcopy(r) + new_r.id = uuid4() + highlight_ids.append(new_r.id) + + new_resources.append(new_r) + else: + new_resources.append(r) + + print(f"Previous resources ({len(dataset.resources)})") + for r in dataset.resources: + highlight = " <---- CHANGED !" if r.id in highlight_ids else "" + print(f"\t{r.id} {r.title} {highlight}") + + print(f"New resources ({len(new_resources)})") + for r in new_resources: + highlight = " <---- CHANGED !" 
if r.id in highlight_ids else ""
+                print(f"\t{r.id} {r.title} {highlight}")
+
+            if fix:
+                dataset.resources = new_resources
+                dataset.save()
+
+        print()
+        print("---")
+        print("---")
+        print("---")
+        print()
+
+    print(f"Resources with duplicated IDs: {count_resources}")
+    print(f"Datasets concerned {count_datasets}")

From 55bc3bc30716781cbc1110a4ef98963a07b9a2aa Mon Sep 17 00:00:00 2001
From: maudetes
Date: Tue, 28 Jan 2025 17:49:55 +0100
Subject: [PATCH 5/8] Fix typo during merge conflict

---
 udata/commands/db.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/udata/commands/db.py b/udata/commands/db.py
index d4bde09e00..c9a2aa904b 100644
--- a/udata/commands/db.py
+++ b/udata/commands/db.py
@@ -21,8 +21,7 @@
 log = logging.getLogger(__name__)
 
 
-@cli.group("
- ")
+@cli.group("db")
 def grp():
     """Database related operations"""
     pass

From aa508b1db8a923b0d456b0b3eaad9c54bc982f3e Mon Sep 17 00:00:00 2001
From: Thibaud Dauce
Date: Wed, 29 Jan 2025 10:25:50 +0100
Subject: [PATCH 6/8] Apply suggestions from code review

Co-authored-by: maudetes
---
 udata/commands/db.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/udata/commands/db.py b/udata/commands/db.py
index c9a2aa904b..d2f804490d 100644
--- a/udata/commands/db.py
+++ b/udata/commands/db.py
@@ -432,10 +432,7 @@ def get_checksum_value(resource: Resource):
         if resource.checksum:
             return resource.checksum.value
 
-        if "analysis:checksum" in resource.extras:
-            return resource.extras["analysis:checksum"]
-
-        return None
+        return resource.extras.get("analysis:checksum")
 
     with click.progressbar(
         Dataset.objects,
@@ -449,6 +446,7 @@ def get_checksum_value(resource: Resource):
             resources[resource.id]["resources"].append(resource)
             resources[resource.id]["datasets"].add(dataset)
 
+    # Keep duplicated resources only
     resources = {id: info for id, info in resources.items() if len(info["resources"]) != 1}
 
     count_resources = 0

From 5b83e5caf8a5f092b029b667905c9ebb9be02eaa Mon Sep 17 00:00:00 2001
From: Thibaud Dauce
Date: Wed, 29 Jan 2025 10:32:20 +0100
Subject: [PATCH 7/8] Some argument changes

---
 udata/commands/db.py | 49 ++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 25 deletions(-)

diff --git a/udata/commands/db.py b/udata/commands/db.py
index d2f804490d..048e9ab11b 100644
--- a/udata/commands/db.py
+++ b/udata/commands/db.py
@@ -4,6 +4,7 @@
 import os
 import traceback
 from itertools import groupby
+from typing import Optional
 from uuid import uuid4
 
 import click
@@ -390,28 +391,26 @@ def check_integrity(models):
 
 @grp.command()
 @click.option(
-    "-did",
-    "--duplicate-inside-dataset",
+    "-sdid",
+    "--skip-duplicates-inside-dataset",
     is_flag=True,
-    help="Show duplicates inside the same dataset (same resource ID inside one dataset)",
+    help="Do not show duplicates inside the same dataset (same resource ID inside one dataset)",
 )
 @click.option(
-    "-dod",
-    "--duplicate-outside-dataset",
+    "-sdod",
+    "--skip-duplicates-outside-dataset",
     is_flag=True,
-    help="Show duplicates outside (same resource ID shared between datasets)",
+    help="Do not show duplicates between datasets (same resource ID shared between datasets)",
 )
 @click.option(
-    "-emf",
-    "--exclude-meteo-france",
-    is_flag=True,
-    help="Exclude Météo France datasets",
+    "-e",
+    "--exclude-org",
+    help="Exclude datasets from this org",
 )
 @click.option(
-    "-omf",
-    "--only-meteo-france",
-    is_flag=True,
-    help="Only Météo France datasets",
+    "-o",
+    "--only-org",
+    help="Only datasets from this org",
 )
 @click.option(
     "-f",
     "--fix",
@@ -420,11 +419,11 
@@ def check_integrity(models): help="Auto-fix some problems", ) def check_duplicate_resources_ids( - duplicate_inside_dataset, - duplicate_outside_dataset, - exclude_meteo_france, - only_meteo_france, - fix, + skip_duplicates_inside_dataset: bool, + skip_duplicates_outside_dataset: bool, + exclude_org: Optional[str], + only_org: Optional[str], + fix: bool, ): resources = {} @@ -452,24 +451,24 @@ def get_checksum_value(resource: Resource): count_resources = 0 count_datasets = 0 for id, info in resources.items(): - if len(info["datasets"]) == 1 and not duplicate_inside_dataset: + if len(info["datasets"]) == 1 and skip_duplicates_inside_dataset: continue - if len(info["datasets"]) > 1 and not duplicate_outside_dataset: + if len(info["datasets"]) > 1 and skip_duplicates_outside_dataset: continue # Filter out meteo france if ( - exclude_meteo_france + exclude_org and list(info["datasets"])[0].organization - and str(list(info["datasets"])[0].organization.id) == "534fff8ba3a7292c64a77ed4" + and str(list(info["datasets"])[0].organization.id) == exclude_org ): continue # Filter everything except meteo france - if only_meteo_france and ( + if only_org and ( not list(info["datasets"])[0].organization - or str(list(info["datasets"])[0].organization.id) != "534fff8ba3a7292c64a77ed4" + or str(list(info["datasets"])[0].organization.id) != only_org ): continue From dd1a10885be77d92ec76d1633bbfd0b144b49602 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Wed, 29 Jan 2025 10:52:48 +0100 Subject: [PATCH 8/8] Add dry run tag if not a fix and simplify logic around new_resources --- udata/commands/db.py | 48 ++++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/udata/commands/db.py b/udata/commands/db.py index 048e9ab11b..a15153a8b4 100644 --- a/udata/commands/db.py +++ b/udata/commands/db.py @@ -426,6 +426,7 @@ def check_duplicate_resources_ids( fix: bool, ): resources = {} + dry_run = "[ DONE ]" if fix else "[DRYRUN]" def get_checksum_value(resource: Resource): if resource.checksum: @@ -499,46 +500,35 @@ def get_checksum_value(resource: Resource): and resource1.url == resource2.url ): print( - "Since checksum and URL are the same, fixing by removing the second resource…\n" + f"{dry_run} Since checksum and URL are the same, fixing by removing the second resource…\n" ) - for r in dataset.resources: - # If it's the duplicated resource we're interested in and - # that ID was already added to the new_resources (so we are - # on the second resource), do not add it. 
- if r.id == id and id in [r.id for r in new_resources]: - continue - - new_resources.append(r) + new_resources = [r for r in dataset.resources if r != resource2] else: print( - "Since checksum and URL are not the same, fixing by setting a new ID on second resource…\n" + f"{dry_run} Since checksum and URL are not the same, fixing by setting a new ID on second resource…\n" ) - for r in dataset.resources: - # If it's the duplicated resource we're interested in and - # that ID was already added to the new_resources (so we are - # on the second resource), generate a new UUID - if r.id == id and id in [r.id for r in new_resources]: - # Just for logging we copy the resource to avoid changing the ID - # on the original resource (and have a clear compare at the end) - new_r = copy.deepcopy(r) - new_r.id = uuid4() - highlight_ids.append(new_r.id) - - new_resources.append(new_r) - else: - new_resources.append(r) - - print(f"Previous resources ({len(dataset.resources)})") + # Just for logging we copy the resource to avoid changing the ID + # on the original resource (and have a clear compare at the end) + new_resource2 = copy.deepcopy(resource2) + new_resource2.id = uuid4() + highlight_ids.append(new_resource2.id) + + # Replace `resource2` by `new_resource2` in the `new_resources` array. + new_resources = [ + (new_resource2 if r == resource2 else r) for r in dataset.resources + ] + + print(f"{dry_run} Previous resources ({len(dataset.resources)})") for r in dataset.resources: highlight = " <---- CHANGED !" if r.id in highlight_ids else "" - print(f"\t{r.id} {r.title} {highlight}") + print(f"{dry_run} \t{r.id} {r.title} {highlight}") - print(f"New resources ({len(new_resources)})") + print(f"{dry_run} New resources ({len(new_resources)})") for r in new_resources: highlight = " <---- CHANGED !" if r.id in highlight_ids else "" - print(f"\t{r.id} {r.title} {highlight}") + print(f"{dry_run} \t{r.id} {r.title} {highlight}") if fix: dataset.resources = new_resources
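
A minimal usage sketch of the command this series adds (not part of the patches themselves). It assumes Click's default underscore-to-dash command naming for the `check_duplicate_resources_ids` function under the `db` group seen in patch 5; the organization ID below is the Météo France example quoted in patch 1:

    # Report every duplicated resource ID (read-only without --fix)
    udata db check-duplicate-resources-ids

    # Restrict the report to datasets from a single organization
    udata db check-duplicate-resources-ids --only-org 534fff8ba3a7292c64a77ed4

    # Skip duplicates shared between datasets and apply the automatic fix
    udata db check-duplicate-resources-ids --skip-duplicates-outside-dataset --fix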