From bd5063025fda28c070cd86f04295d41838c9002d Mon Sep 17 00:00:00 2001
From: Thibaud Dauce
Date: Tue, 14 Jan 2025 09:33:52 +0100
Subject: [PATCH 1/8] Add report of duplicate resource IDs

---
 udata/commands/db.py | 122 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 122 insertions(+)

diff --git a/udata/commands/db.py b/udata/commands/db.py
index e894f4f78a..98f312d338 100644
--- a/udata/commands/db.py
+++ b/udata/commands/db.py
@@ -12,6 +12,7 @@
 from udata import models as core_models
 from udata.api import oauth2 as oauth2_models
 from udata.commands import cli, cyan, echo, green, magenta, red, white, yellow
+from udata.core.dataset.models import Dataset
 from udata.harvest import models as harvest_models
 from udata.mongo import db
 
@@ -394,3 +395,124 @@ def print_and_save(text: str):
 def check_integrity(models):
     """Check the integrity of the database from a business perspective"""
     check_references(models)
+
+
+@grp.command()
+@click.option(
+    "-did",
+    "--duplicate-inside-dataset",
+    is_flag=True,
+    help="Show duplicates inside the same dataset (same resource ID inside one dataset)",
+)
+@click.option(
+    "-dod",
+    "--duplicate-outside-dataset",
+    is_flag=True,
+    help="Show duplicates outside (same resource ID shared between datasets)",
+)
+@click.option(
+    "-emf",
+    "--exclude-meteo-france",
+    is_flag=True,
+    help="Exclude Météo France datasets",
+)
+@click.option(
+    "-omf",
+    "--only-meteo-france",
+    is_flag=True,
+    help="Only Météo France datasets",
+)
+def check_duplicate_resources_ids(
+    duplicate_inside_dataset, duplicate_outside_dataset, exclude_meteo_france, only_meteo_france
+):
+    resources = {}
+
+    with click.progressbar(Dataset.objects, Dataset.objects().count()) as datasets:
+        for dataset in datasets:
+            for resource in dataset.resources:
+                if resource.id not in resources:
+                    resources[resource.id] = {"resources": [], "datasets": set()}
+                resources[resource.id]["resources"].append(resource)
+                resources[resource.id]["datasets"].add(dataset)
+
+    resources = {id: info for id, info in resources.items() if len(info["resources"]) != 1}
+
+    if duplicate_inside_dataset:
+        count_resources = 0
+        count_datasets = 0
+        for id, info in resources.items():
+            if len(info["datasets"]) == 1:
+                continue
+
+            # Filter out meteo france
+            if (
+                exclude_meteo_france
+                and list(info["datasets"])[0].organization
+                and str(list(info["datasets"])[0].organization.id) == "534fff8ba3a7292c64a77ed4"
+            ):
+                continue
+
+            # Filter everything except meteo france
+            if only_meteo_france and (
+                not list(info["datasets"])[0].organization
+                or str(list(info["datasets"])[0].organization.id) != "534fff8ba3a7292c64a77ed4"
+            ):
+                continue
+
+            count = len(info["resources"])
+            print(f"With ID {id}: {count} resources")
+            for dataset in info["datasets"]:
+                count_datasets += 1
+                print(f"\t- Dataset#{dataset.id} {dataset.title}")
+            print("")
+            for resource in info["resources"]:
+                count_resources += 1
+                print(f"\t- Resource {resource.title}")
+            print()
+            print("---")
+            print("---")
+            print("---")
+            print()
+
+        print(f"Resources with duplicated IDs: {count_resources}")
+        print(f"Datasets concerned {count_datasets}")
+
+    if duplicate_outside_dataset:
+        count_resources = 0
+        count_datasets = 0
+        for id, info in resources.items():
+            if len(info["datasets"]) > 1:
+                continue
+
+            # Filter out meteo france
+            if (
+                exclude_meteo_france
+                and list(info["datasets"])[0].organization
+                and str(list(info["datasets"])[0].organization.id) == "534fff8ba3a7292c64a77ed4"
+            ):
+                continue
+
+            # Filter everything except meteo france
+            if 
only_meteo_france and (
+                not list(info["datasets"])[0].organization
+                or str(list(info["datasets"])[0].organization.id) != "534fff8ba3a7292c64a77ed4"
+            ):
+                continue
+
+            count = len(info["resources"])
+            print(f"With ID {id}: {count} resources")
+            for dataset in info["datasets"]:
+                count_datasets += 1
+                print(f"\t- Dataset#{dataset.id} {dataset.title}")
+            print("")
+            for resource in info["resources"]:
+                count_resources += 1
+                print(f"\t- Resource {resource.title}")
+            print()
+            print("---")
+            print("---")
+            print("---")
+            print()
+
+        print(f"Resources with duplicated IDs: {count_resources}")
+        print(f"Datasets concerned {count_datasets}")

From c42a8a2a2f8cb31642d728ce63b835d2f3d406f1 Mon Sep 17 00:00:00 2001
From: Thibaud Dauce
Date: Tue, 14 Jan 2025 09:40:22 +0100
Subject: [PATCH 2/8] Fix inversion and add changelog

---
 CHANGELOG.md         | 2 +-
 udata/commands/db.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5b8d5b96fe..abcf51c74c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,7 +2,7 @@
 
 ## Current (in progress)
 
-- Nothing yet
+- Add report of duplicate resource IDs [#3247](https://github.com/opendatateam/udata/pull/3247)
 
 ## 10.0.7 (2025-01-13)
 
diff --git a/udata/commands/db.py b/udata/commands/db.py
index 98f312d338..74806cfddf 100644
--- a/udata/commands/db.py
+++ b/udata/commands/db.py
@@ -437,7 +437,7 @@ def check_duplicate_resources_ids(
 
     resources = {id: info for id, info in resources.items() if len(info["resources"]) != 1}
 
-    if duplicate_inside_dataset:
+    if duplicate_outside_dataset:
         count_resources = 0
         count_datasets = 0
         for id, info in resources.items():
@@ -477,7 +477,7 @@ def check_duplicate_resources_ids(
         print(f"Resources with duplicated IDs: {count_resources}")
         print(f"Datasets concerned {count_datasets}")
 
-    if duplicate_outside_dataset:
+    if duplicate_inside_dataset:
         count_resources = 0
         count_datasets = 0
         for id, info in resources.items():

From 23b9edb180588dfa9ca8bc7a54cf7aa71e33c395 Mon Sep 17 00:00:00 2001
From: Thibaud Dauce
Date: Tue, 14 Jan 2025 16:28:11 +0100
Subject: [PATCH 3/8] Add checksum info

---
 udata/commands/db.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/udata/commands/db.py b/udata/commands/db.py
index 74806cfddf..aded9eb084 100644
--- a/udata/commands/db.py
+++ b/udata/commands/db.py
@@ -12,7 +12,7 @@
 from udata import models as core_models
 from udata.api import oauth2 as oauth2_models
 from udata.commands import cli, cyan, echo, green, magenta, red, white, yellow
-from udata.core.dataset.models import Dataset
+from udata.core.dataset.models import Dataset, Resource
 from udata.harvest import models as harvest_models
 from udata.mongo import db
 
@@ -427,6 +427,15 @@ def check_duplicate_resources_ids(
 ):
     resources = {}
 
+    def get_additional_info(resource: Resource):
+        if resource.checksum:
+            return f" ({resource.checksum.type} {resource.checksum.value} / {resource.url})"
+
+        if "analysis:checksum" in resource.extras:
+            return f" ({resource.extras['analysis:checksum']} / {resource.url})"
+
+        return f" ({resource.url})"
+
     with click.progressbar(Dataset.objects, Dataset.objects().count()) as datasets:
         for dataset in datasets:
             for resource in dataset.resources:
@@ -467,7 +476,7 @@ def check_duplicate_resources_ids(
             print("")
             for resource in info["resources"]:
                 count_resources += 1
-                print(f"\t- Resource {resource.title}")
+                print(f"\t- Resource {resource.title}{get_additional_info(resource)}")
             print()
             print("---")
             print("---")
@@ -507,7 +516,7 @@ def 
check_duplicate_resources_ids( print("") for resource in info["resources"]: count_resources += 1 - print(f"\t- Resource {resource.title}") + print(f"\t- Resource {resource.title}{get_additional_info(resource)}") print() print("---") print("---") From d0601342598d8183263b5fcbbf4fb7fff8969d8f Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Wed, 22 Jan 2025 16:14:20 +0100 Subject: [PATCH 4/8] Add --fix to command --- udata/commands/db.py | 213 ++++++++++++++++++++++++++----------------- 1 file changed, 127 insertions(+), 86 deletions(-) diff --git a/udata/commands/db.py b/udata/commands/db.py index aded9eb084..3cf0278843 100644 --- a/udata/commands/db.py +++ b/udata/commands/db.py @@ -1,8 +1,10 @@ import collections +import copy import logging import os import traceback from itertools import groupby +from uuid import uuid4 import click import mongoengine @@ -292,7 +294,7 @@ def print_and_save(text: str): print("Those references will be inspected:") for reference in references: - print(f'- {reference["repr"]}({reference["destination"]}) — {reference["type"]}') + print(f"- {reference['repr']}({reference['destination']}) — {reference['type']}") print("") total = 0 @@ -306,7 +308,7 @@ def print_and_save(text: str): with click.progressbar(qs, length=count) as models: for obj in models: for reference in model_references: - key = f'\t- {reference["repr"]}({reference["destination"]}) — {reference["type"]}…' + key = f"\t- {reference['repr']}({reference['destination']}) — {reference['type']}…" if key not in errors[model]: errors[model][key] = 0 @@ -317,7 +319,7 @@ def print_and_save(text: str): except mongoengine.errors.DoesNotExist: errors[model][key] += 1 print_and_save( - f'\t{model.__name__}#{obj.id} have a broken reference for `{reference["name"]}`' + f"\t{model.__name__}#{obj.id} have a broken reference for `{reference['name']}`" ) elif reference["type"] == "list": attr_list = getattr(obj, reference["name"], []) @@ -327,7 +329,7 @@ def print_and_save(text: str): if isinstance(sub, DBRef): errors[model][key] += 1 print_and_save( - f'\t{model.__name__}#{obj.id} have a broken reference for {reference["name"]}[{i}]' + f"\t{model.__name__}#{obj.id} have a broken reference for {reference['name']}[{i}]" ) elif reference["type"] == "embed_list": p1, p2 = reference["name"].split("__") @@ -367,7 +369,7 @@ def print_and_save(text: str): f"\t{model.__name__}#{obj.id} have a broken reference for {p1}.{p2}[{i}]" ) else: - print_and_save(f'Unknown ref type {reference["type"]}') + print_and_save(f"Unknown ref type {reference['type']}") except mongoengine.errors.FieldDoesNotExist: print_and_save( f"[ERROR for {model.__name__} {obj.id}] {traceback.format_exc()}" @@ -422,106 +424,145 @@ def check_integrity(models): is_flag=True, help="Only Météo France datasets", ) +@click.option( + "-f", + "--fix", + is_flag=True, + help="Auto-fix some problems", +) def check_duplicate_resources_ids( - duplicate_inside_dataset, duplicate_outside_dataset, exclude_meteo_france, only_meteo_france + duplicate_inside_dataset, + duplicate_outside_dataset, + exclude_meteo_france, + only_meteo_france, + fix, ): resources = {} - def get_additional_info(resource: Resource): + def get_checksum_value(resource: Resource): if resource.checksum: - return f" ({resource.checksum.type} {resource.checksum.value} / {resource.url})" + return resource.checksum.value if "analysis:checksum" in resource.extras: - return f" ({resource.extras['analysis:checksum']} / {resource.url})" + return resource.extras["analysis:checksum"] - return f" 
({resource.url})" + return None - with click.progressbar(Dataset.objects, Dataset.objects().count()) as datasets: + with click.progressbar( + Dataset.objects, + Dataset.objects().count(), + ) as datasets: for dataset in datasets: for resource in dataset.resources: if resource.id not in resources: resources[resource.id] = {"resources": [], "datasets": set()} + resources[resource.id]["resources"].append(resource) resources[resource.id]["datasets"].add(dataset) resources = {id: info for id, info in resources.items() if len(info["resources"]) != 1} - if duplicate_outside_dataset: - count_resources = 0 - count_datasets = 0 - for id, info in resources.items(): - if len(info["datasets"]) == 1: - continue + count_resources = 0 + count_datasets = 0 + for id, info in resources.items(): + if len(info["datasets"]) == 1 and not duplicate_inside_dataset: + continue - # Filter out meteo france - if ( - exclude_meteo_france - and list(info["datasets"])[0].organization - and str(list(info["datasets"])[0].organization.id) == "534fff8ba3a7292c64a77ed4" - ): - continue + if len(info["datasets"]) > 1 and not duplicate_outside_dataset: + continue - # Filter everything except meteo france - if only_meteo_france and ( - not list(info["datasets"])[0].organization - or str(list(info["datasets"])[0].organization.id) != "534fff8ba3a7292c64a77ed4" - ): - continue - - count = len(info["resources"]) - print(f"With ID {id}: {count} resources") - for dataset in info["datasets"]: - count_datasets += 1 - print(f"\t- Dataset#{dataset.id} {dataset.title}") - print("") - for resource in info["resources"]: - count_resources += 1 - print(f"\t- Resource {resource.title}{get_additional_info(resource)}") - print() - print("---") - print("---") - print("---") - print() - - print(f"Resources with duplicated IDs: {count_resources}") - print(f"Datasets concerned {count_datasets}") - - if duplicate_inside_dataset: - count_resources = 0 - count_datasets = 0 - for id, info in resources.items(): - if len(info["datasets"]) > 1: - continue - - # Filter out meteo france - if ( - exclude_meteo_france - and list(info["datasets"])[0].organization - and str(list(info["datasets"])[0].organization.id) == "534fff8ba3a7292c64a77ed4" - ): - continue + # Filter out meteo france + if ( + exclude_meteo_france + and list(info["datasets"])[0].organization + and str(list(info["datasets"])[0].organization.id) == "534fff8ba3a7292c64a77ed4" + ): + continue + + # Filter everything except meteo france + if only_meteo_france and ( + not list(info["datasets"])[0].organization + or str(list(info["datasets"])[0].organization.id) != "534fff8ba3a7292c64a77ed4" + ): + continue + + count = len(info["resources"]) + print(f"With ID {id}: {count} resources") + for dataset in info["datasets"]: + count_datasets += 1 + print(f"\t- Dataset#{dataset.id} {dataset.title}") + print("") + for resource in info["resources"]: + count_resources += 1 + print( + f"\t- Resource {resource.title} ({get_checksum_value(resource)} / {resource.url})" + ) - # Filter everything except meteo france - if only_meteo_france and ( - not list(info["datasets"])[0].organization - or str(list(info["datasets"])[0].organization.id) != "534fff8ba3a7292c64a77ed4" + print() + + if len(info["datasets"]) == 1 and len(info["resources"]) == 2: + dataset = next(iter(info["datasets"])) + + resource1 = info["resources"][0] + resource2 = info["resources"][1] + + new_resources = [] + highlight_ids = [id] + if ( + get_checksum_value(resource1) == get_checksum_value(resource2) + and resource1.url == resource2.url ): - 
continue - - count = len(info["resources"]) - print(f"With ID {id}: {count} resources") - for dataset in info["datasets"]: - count_datasets += 1 - print(f"\t- Dataset#{dataset.id} {dataset.title}") - print("") - for resource in info["resources"]: - count_resources += 1 - print(f"\t- Resource {resource.title}{get_additional_info(resource)}") - print() - print("---") - print("---") - print("---") - print() - - print(f"Resources with duplicated IDs: {count_resources}") - print(f"Datasets concerned {count_datasets}") + print( + "Since checksum and URL are the same, fixing by removing the second resource…\n" + ) + + for r in dataset.resources: + # If it's the duplicated resource we're interested in and + # that ID was already added to the new_resources (so we are + # on the second resource), do not add it. + if r.id == id and id in [r.id for r in new_resources]: + continue + + new_resources.append(r) + else: + print( + "Since checksum and URL are not the same, fixing by setting a new ID on second resource…\n" + ) + + for r in dataset.resources: + # If it's the duplicated resource we're interested in and + # that ID was already added to the new_resources (so we are + # on the second resource), generate a new UUID + if r.id == id and id in [r.id for r in new_resources]: + # Just for logging we copy the resource to avoid changing the ID + # on the original resource (and have a clear compare at the end) + new_r = copy.deepcopy(r) + new_r.id = uuid4() + highlight_ids.append(new_r.id) + + new_resources.append(new_r) + else: + new_resources.append(r) + + print(f"Previous resources ({len(dataset.resources)})") + for r in dataset.resources: + highlight = " <---- CHANGED !" if r.id in highlight_ids else "" + print(f"\t{r.id} {r.title} {highlight}") + + print(f"New resources ({len(new_resources)})") + for r in new_resources: + highlight = " <---- CHANGED !" 
if r.id in highlight_ids else ""
+                print(f"\t{r.id} {r.title} {highlight}")
+
+            if fix:
+                dataset.resources = new_resources
+                dataset.save()
+
+        print()
+        print("---")
+        print("---")
+        print("---")
+        print()
+
+    print(f"Resources with duplicated IDs: {count_resources}")
+    print(f"Datasets concerned {count_datasets}")

From 55bc3bc30716781cbc1110a4ef98963a07b9a2aa Mon Sep 17 00:00:00 2001
From: maudetes
Date: Tue, 28 Jan 2025 17:49:55 +0100
Subject: [PATCH 5/8] Fix typo during merge conflict

---
 udata/commands/db.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/udata/commands/db.py b/udata/commands/db.py
index d4bde09e00..c9a2aa904b 100644
--- a/udata/commands/db.py
+++ b/udata/commands/db.py
@@ -21,8 +21,7 @@
 log = logging.getLogger(__name__)
 
 
-@cli.group("
- ")
+@cli.group("db")
 def grp():
     """Database related operations"""
     pass

From aa508b1db8a923b0d456b0b3eaad9c54bc982f3e Mon Sep 17 00:00:00 2001
From: Thibaud Dauce
Date: Wed, 29 Jan 2025 10:25:50 +0100
Subject: [PATCH 6/8] Apply suggestions from code review

Co-authored-by: maudetes
---
 udata/commands/db.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/udata/commands/db.py b/udata/commands/db.py
index c9a2aa904b..d2f804490d 100644
--- a/udata/commands/db.py
+++ b/udata/commands/db.py
@@ -432,10 +432,7 @@ def get_checksum_value(resource: Resource):
         if resource.checksum:
             return resource.checksum.value
 
-        if "analysis:checksum" in resource.extras:
-            return resource.extras["analysis:checksum"]
-
-        return None
+        return resource.extras.get("analysis:checksum")
 
     with click.progressbar(
         Dataset.objects,
@@ -449,6 +446,7 @@ def get_checksum_value(resource: Resource):
             resources[resource.id]["resources"].append(resource)
             resources[resource.id]["datasets"].add(dataset)
 
+    # Keep duplicated resources only
     resources = {id: info for id, info in resources.items() if len(info["resources"]) != 1}
 
     count_resources = 0

From 5b83e5caf8a5f092b029b667905c9ebb9be02eaa Mon Sep 17 00:00:00 2001
From: Thibaud Dauce
Date: Wed, 29 Jan 2025 10:32:20 +0100
Subject: [PATCH 7/8] Some argument changes

---
 udata/commands/db.py | 49 ++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 25 deletions(-)

diff --git a/udata/commands/db.py b/udata/commands/db.py
index d2f804490d..048e9ab11b 100644
--- a/udata/commands/db.py
+++ b/udata/commands/db.py
@@ -4,6 +4,7 @@
 import os
 import traceback
 from itertools import groupby
+from typing import Optional
 from uuid import uuid4
 
 import click
@@ -390,28 +391,26 @@ def check_integrity(models):
 
 @grp.command()
 @click.option(
-    "-did",
-    "--duplicate-inside-dataset",
+    "-sdid",
+    "--skip-duplicates-inside-dataset",
     is_flag=True,
-    help="Show duplicates inside the same dataset (same resource ID inside one dataset)",
+    help="Do not show duplicates inside the same dataset (same resource ID inside one dataset)",
 )
 @click.option(
-    "-dod",
-    "--duplicate-outside-dataset",
+    "-sdod",
+    "--skip-duplicates-outside-dataset",
     is_flag=True,
-    help="Show duplicates outside (same resource ID shared between datasets)",
+    help="Do not show duplicates between datasets (same resource ID shared between datasets)",
 )
 @click.option(
-    "-emf",
-    "--exclude-meteo-france",
-    is_flag=True,
-    help="Exclude Météo France datasets",
+    "-e",
+    "--exclude-org",
+    help="Exclude datasets from this org",
 )
 @click.option(
-    "-omf",
-    "--only-meteo-france",
-    is_flag=True,
-    help="Only Météo France datasets",
+    "-o",
+    "--only-org",
+    help="Only datasets from this org",
 )
 @click.option(
     "-f",
     "--fix",
@@ -420,11 +419,11 
@@ def check_integrity(models): help="Auto-fix some problems", ) def check_duplicate_resources_ids( - duplicate_inside_dataset, - duplicate_outside_dataset, - exclude_meteo_france, - only_meteo_france, - fix, + skip_duplicates_inside_dataset: bool, + skip_duplicates_outside_dataset: bool, + exclude_org: Optional[str], + only_org: Optional[str], + fix: bool, ): resources = {} @@ -452,24 +451,24 @@ def get_checksum_value(resource: Resource): count_resources = 0 count_datasets = 0 for id, info in resources.items(): - if len(info["datasets"]) == 1 and not duplicate_inside_dataset: + if len(info["datasets"]) == 1 and skip_duplicates_inside_dataset: continue - if len(info["datasets"]) > 1 and not duplicate_outside_dataset: + if len(info["datasets"]) > 1 and skip_duplicates_outside_dataset: continue # Filter out meteo france if ( - exclude_meteo_france + exclude_org and list(info["datasets"])[0].organization - and str(list(info["datasets"])[0].organization.id) == "534fff8ba3a7292c64a77ed4" + and str(list(info["datasets"])[0].organization.id) == exclude_org ): continue # Filter everything except meteo france - if only_meteo_france and ( + if only_org and ( not list(info["datasets"])[0].organization - or str(list(info["datasets"])[0].organization.id) != "534fff8ba3a7292c64a77ed4" + or str(list(info["datasets"])[0].organization.id) != only_org ): continue From dd1a10885be77d92ec76d1633bbfd0b144b49602 Mon Sep 17 00:00:00 2001 From: Thibaud Dauce Date: Wed, 29 Jan 2025 10:52:48 +0100 Subject: [PATCH 8/8] Add dry run tag if not a fix and simplify logic around new_resources --- udata/commands/db.py | 48 ++++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/udata/commands/db.py b/udata/commands/db.py index 048e9ab11b..a15153a8b4 100644 --- a/udata/commands/db.py +++ b/udata/commands/db.py @@ -426,6 +426,7 @@ def check_duplicate_resources_ids( fix: bool, ): resources = {} + dry_run = "[ DONE ]" if fix else "[DRYRUN]" def get_checksum_value(resource: Resource): if resource.checksum: @@ -499,46 +500,35 @@ def get_checksum_value(resource: Resource): and resource1.url == resource2.url ): print( - "Since checksum and URL are the same, fixing by removing the second resource…\n" + f"{dry_run} Since checksum and URL are the same, fixing by removing the second resource…\n" ) - for r in dataset.resources: - # If it's the duplicated resource we're interested in and - # that ID was already added to the new_resources (so we are - # on the second resource), do not add it. 
- if r.id == id and id in [r.id for r in new_resources]: - continue - - new_resources.append(r) + new_resources = [r for r in dataset.resources if r != resource2] else: print( - "Since checksum and URL are not the same, fixing by setting a new ID on second resource…\n" + f"{dry_run} Since checksum and URL are not the same, fixing by setting a new ID on second resource…\n" ) - for r in dataset.resources: - # If it's the duplicated resource we're interested in and - # that ID was already added to the new_resources (so we are - # on the second resource), generate a new UUID - if r.id == id and id in [r.id for r in new_resources]: - # Just for logging we copy the resource to avoid changing the ID - # on the original resource (and have a clear compare at the end) - new_r = copy.deepcopy(r) - new_r.id = uuid4() - highlight_ids.append(new_r.id) - - new_resources.append(new_r) - else: - new_resources.append(r) - - print(f"Previous resources ({len(dataset.resources)})") + # Just for logging we copy the resource to avoid changing the ID + # on the original resource (and have a clear compare at the end) + new_resource2 = copy.deepcopy(resource2) + new_resource2.id = uuid4() + highlight_ids.append(new_resource2.id) + + # Replace `resource2` by `new_resource2` in the `new_resources` array. + new_resources = [ + (new_resource2 if r == resource2 else r) for r in dataset.resources + ] + + print(f"{dry_run} Previous resources ({len(dataset.resources)})") for r in dataset.resources: highlight = " <---- CHANGED !" if r.id in highlight_ids else "" - print(f"\t{r.id} {r.title} {highlight}") + print(f"{dry_run} \t{r.id} {r.title} {highlight}") - print(f"New resources ({len(new_resources)})") + print(f"{dry_run} New resources ({len(new_resources)})") for r in new_resources: highlight = " <---- CHANGED !" if r.id in highlight_ids else "" - print(f"\t{r.id} {r.title} {highlight}") + print(f"{dry_run} \t{r.id} {r.title} {highlight}") if fix: dataset.resources = new_resources
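
A minimal usage sketch of the command this series adds (not part of the patches themselves). It assumes Click's default underscore-to-dash command naming for the `check_duplicate_resources_ids` function under the `db` group seen in patch 5; the organization ID below is the Météo France example quoted in patch 1:

    # Report every duplicated resource ID (read-only without --fix)
    udata db check-duplicate-resources-ids

    # Restrict the report to datasets from a single organization
    udata db check-duplicate-resources-ids --only-org 534fff8ba3a7292c64a77ed4

    # Skip duplicates shared between datasets and apply the automatic fix
    udata db check-duplicate-resources-ids --skip-duplicates-outside-dataset --fix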