Skip to content

Commit

Permalink
Rename "irrelevants" to "off-topic" for Clarity (#8)
Browse files Browse the repository at this point in the history
* started with renaming

* version bump

* added missing target

* installation problem fixes

* optimized req

* fixed package building

* fixed last bit of issues and ran examples

* added test for backwards compatibility check
  • Loading branch information
FabianGroeger96 authored Jan 7, 2025
2 parents b6295f0 + 9a60045 commit 1b4411b
Show file tree
Hide file tree
Showing 50 changed files with 353 additions and 350 deletions.
2 changes: 1 addition & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[submodule "ssl_library"]
path = src/ssl_library
path = selfclean/ssl_library
url = https://github.com/mse-thesis-dermatology/ssl_library
7 changes: 1 addition & 6 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,12 +1,7 @@
FROM pytorch/pytorch:1.9.1-cuda11.1-cudnn8-runtime

RUN apt-get update
RUN apt-get install -y apt-transport-https
RUN apt-get install -y libtcmalloc-minimal4
RUN apt-get install -y libomp-dev
RUN apt-get install -y sox
RUN apt-get install -y git
RUN apt-get install -y gcc g++ python3-dev python-dev
RUN apt-get install -y apt-transport-https libtcmalloc-minimal4 libomp-dev sox git gcc g++ python3-dev python-dev
RUN apt-get clean

RUN pip install --upgrade pip
Expand Down
12 changes: 10 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,19 @@ DOCKER_CMD := docker run $(DOCKER_ARGS) $(GPU_ARGS) $(DOCKER_CONTAINER_NAME) -it
###########################
# PROJECT UTILS
###########################
.PHONY: init
init: ##@Utils initializes the project and pulls all the necessary data
@git submodule update --init --recursive

.PHONY: install
install: ##@Utils install the dependencies for the project
@python3 -m pip install -r requirements.txt
@pre-commit install

.PHONY: update_dependencies
update_dependencies: ##@Utils updates the dependencies for the project in the toml file
@python3.9 -m update_dependencies

.PHONY: clean
clean: ##@Utils clean the project
@black .
Expand Down Expand Up @@ -126,8 +134,8 @@ start_jupyter: _build ##@Docker start a jupyter notebook inside the docker imag
###########################
.PHONY: test
test: _build ##@Test run all tests in the project
$(DOCKER_CMD) /bin/bash -c "python3 -m coverage run -m pytest tests --junitxml=report.xml; coverage report -i --include=src/* --omit="src/ssl_library/*"; coverage xml -i --include=src/* --omit="src/ssl_library/*";"
$(DOCKER_CMD) /bin/bash -c "python3 -m coverage run -m pytest tests --junitxml=report.xml; coverage report -i --include=selfclean/* --omit="selfclean/ssl_library/*"; coverage xml -i --include=selfclean/* --omit="selfclean/ssl_library/*";"

.PHONY: unittest
unittest: _build ##@Test run all unittests in the project
$(DOCKER_CMD) /bin/bash -c "python3 -m coverage run -m pytest tests --junitxml=report.xml --ignore=tests/integration_tests; coverage report -i --include=src/* --omit="src/ssl_library/*"; coverage xml -i --include=src/* --omit="src/ssl_library/*";"
$(DOCKER_CMD) /bin/bash -c "python3 -m coverage run -m pytest tests --junitxml=report.xml --ignore=tests/integration_tests; coverage report -i --include=selfclean/* --omit="selfclean/ssl_library/*"; coverage xml -i --include=selfclean/* --omit="selfclean/ssl_library/*";"
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

![SelfClean Teaser](https://github.com/Digital-Dermatology/SelfClean/raw/main/assets/SelfClean_Teaser.png)

A holistic self-supervised data cleaning strategy to detect irrelevant samples, near duplicates, and label errors.
A holistic self-supervised data cleaning strategy to detect off-topic samples, near duplicates, and label errors.

**Publications:** [SelfClean Paper (NeurIPS24)](https://arxiv.org/abs/2305.17048) | [Data Cleaning Protocol Paper (ML4H23@NeurIPS)](https://arxiv.org/abs/2309.06961)

Expand Down Expand Up @@ -39,7 +39,7 @@ from selfclean import SelfClean
selfclean = SelfClean(
# displays the top-7 images from each error type
# per default this option is disabled
plot_top_N=7,
plot_top_N=7,
)

# run on pytorch dataset
Expand All @@ -53,7 +53,7 @@ issues = selfclean.run_on_image_folder(

# get the data quality issue rankings
df_near_duplicates = issues.get_issues("near_duplicates", return_as_df=True)
df_irrelevants = issues.get_issues("irrelevants", return_as_df=True)
df_off_topic_samples = issues.get_issues("off_topic_samples", return_as_df=True)
df_label_errors = issues.get_issues("label_errors", return_as_df=True)
```

Expand Down
136 changes: 58 additions & 78 deletions examples/Investigate_Imagenette.ipynb

Large diffs are not rendered by default.

92 changes: 32 additions & 60 deletions examples/Investigate_OxfordIIITPet.ipynb

Large diffs are not rendered by default.

46 changes: 36 additions & 10 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,22 +1,48 @@
[build-system]
requires = [ "setuptools>=42", "wheel",]
build-backend = "setuptools.build_meta"

[project]
name = "SelfClean"
description = "A holistic self-supervised data cleaning strategy to detect irrelevant samples, near duplicates and label errors."
authors = [
{ name = "Fabian Gröger", email = "[email protected]" },
]
version = "0.0.35"
name = "selfclean"
description = "A holistic self-supervised data cleaning strategy to detect off-topic samples, near duplicates and label errors."
readme = "README.md"
keywords = [ "machine_learning", "data_cleaning", "datacentric_ai", "datacentric", "self-supervised learning",]
requires-python = ">=3.6"
classifiers = [ "Programming Language :: Python :: 3", "Operating System :: OS Independent",]
dependencies = [ "SciencePlots", "black>=22.6", "codecov", "coverage>=6", "darglint>=1.8", "einops", "isort>=5.10", "jupyter", "loguru", "matplotlib", "memory-profiler", "numpy", "pandas", "pre-commit>=2.20", "pytest", "pytest-cov>=3", "scikit-image", "scikit_learn", "seaborn", "torchinfo", "torchmetrics", "torchvision", "tqdm", "transformers[torch]==4.27.4",]
[[project.authors]]
name = "Fabian Gröger"
email = "[email protected]"

[project.license]
text = "Attribution-NonCommercial 4.0 International"

[project.optional-dependencies]
approximate_nn = [ "annoy",]

[project.urls]
Homepage = "https://selfclean.github.io/"
"Source Code" = "https://github.com/Digital-Dermatology/SelfClean"

[tool.setuptools]
include-package-data = true

[tool.black]
include = '\.pyi?$'
include = "\\.pyi?$"

[tool.isort]
profile = "black"
skip_gitignore=true
py_version=39
skip_gitignore = true
py_version = 39
default_section = "THIRDPARTY"
known_thirdparty=["wandb"]
known_thirdparty = [ "wandb",]

[tool.pytest.ini_options]
# Set true to see logger output in test command line window
log_cli = false
log_cli_level = "INFO"
log_cli_format = "%(time)s :: %(name)s :: %(message)s"

[tool.setuptools.packages.find]
include = [ "selfclean*", "selfclean.*",]
exclude = [ "tests*", "tests.*", "*.tests", "*.tests.*", "*.tests*",]
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ darglint>=1.8
isort>=5.10
pre-commit>=2.20
pytest-cov>=3
transformers
transformers[torch]==4.27.4
seaborn
SciencePlots
scikit-image
Expand Down
9 changes: 9 additions & 0 deletions selfclean/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""
SelfClean.
A holistic self-supervised data cleaning strategy to detect off-topic samples, near duplicates and label errors.
"""

__author__ = "Fabian Groeger"

from .cleaner.selfclean import SelfClean # noqa: F401
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class AutoCleaningMixin:
def __init__(
self,
auto_cleaning: bool = False,
irrelevant_cut_off: float = 0.01,
off_topic_cut_off: float = 0.01,
near_duplicate_cut_off: float = 0.01,
label_error_cut_off: float = 0.01,
significance_level: float = 0.05,
Expand All @@ -28,7 +28,7 @@ def __init__(
):
super().__init__(**kwargs)
self.auto_cleaning = auto_cleaning
self.irrelevant_cut_off = irrelevant_cut_off
self.off_topic_cut_off = off_topic_cut_off
self.near_duplicate_cut_off = near_duplicate_cut_off
self.label_error_cut_off = label_error_cut_off
self.significance_level = significance_level
Expand Down Expand Up @@ -58,19 +58,19 @@ def perform_auto_cleaning(
)
return_dict["near_duplicates"]["auto_issues"] = issues_dup

# Irrelevant Samples
irrelevant_issues = issue_manger["irrelevants"]
if irrelevant_issues is not None:
# Off-Topic Samples
off_topic_issues = issue_manger["off_topic_samples"]
if off_topic_issues is not None:
if output_path is not None:
self.cleaner_kwargs["path"] = (
f"{output_path.stem}_auto_oods{output_path.suffix}"
)
self.cleaner_kwargs["alpha"] = self.irrelevant_cut_off
issues_ood = self.fraction_cut(
scores=irrelevant_issues["scores"],
self.cleaner_kwargs["alpha"] = self.off_topic_cut_off
issues_ot = self.fraction_cut(
scores=off_topic_issues["scores"],
**self.cleaner_kwargs,
)
return_dict["irrelevants"]["auto_issues"] = issues_ood
return_dict["off_topic_samples"]["auto_issues"] = issues_ot

# Label Errors
label_error_issues = issue_manger["label_errors"]
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@


class IssueTypes(Enum):
# NOTE: We leave the accessibility of "off_topic_samples"
# via "irrelevants" to ensure backwards compatibility
IRRELEVANTS = "irrelevants"
OFF_TOPIC_SAMPLES = "off_topic_samples"
NEAR_DUPLICATES = "near_duplicates"
LABEL_ERRORS = "label_errors"

Expand All @@ -25,6 +28,10 @@ def get_issues(
if issue_type is type(IssueTypes):
issue_type = issue_type.value

# NOTE: Backwards compatibility with "irrelevants"
if issue_type == "irrelevants":
issue_type = IssueTypes.OFF_TOPIC_SAMPLES.value

sel_issues = self.issue_dict.get(issue_type, None)
if sel_issues is None:
return sel_issues
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np


class BaseIrrelevantMixin(ABC):
class BaseOffTopicMixin(ABC):
@abstractmethod
def get_irrelevant_ranking(self) -> Tuple[np.ndarray, np.ndarray]:
def get_off_topic_ranking(self) -> Tuple[np.ndarray, np.ndarray]:
raise NotImplementedError()
Original file line number Diff line number Diff line change
Expand Up @@ -3,32 +3,32 @@
import numpy as np
from scipy.cluster.hierarchy import single

from ...cleaner.irrelevants.base_irrelevant_mixin import BaseIrrelevantMixin
from ...cleaner.off_topic_samples.base_off_topic_mixin import BaseOffTopicMixin
from ...scoring.lad_scoring import LAD
from ...ssl_library.src.utils.logging import plot_dist


class LADIrrelevantMixin(BaseIrrelevantMixin):
class LADOffTopicMixin(BaseOffTopicMixin):
def __init__(self, global_leaves: bool = False, **kwargs):
super().__init__(**kwargs)
self.global_leaves = global_leaves

def get_irrelevant_ranking(self) -> Tuple[np.ndarray, np.ndarray]:
def get_off_topic_ranking(self) -> Tuple[np.ndarray, np.ndarray]:
# linkage_matrix: [idx1, idx2, dist, sample_count]
linkage_matrix = single(self.p_distances)
lad = LAD()
irrelevants = lad.calc_scores(
off_topic_samples = lad.calc_scores(
linkage_matrix=linkage_matrix,
global_leaves=self.global_leaves,
)
# free up allocated memory
del lad, linkage_matrix

if self.plot_distribution and irrelevants is not None:
if self.plot_distribution and off_topic_samples is not None:
plot_dist(
scores=np.asarray([x[0] for x in irrelevants]),
title="Distribution of irrelevant samples",
scores=np.asarray([x[0] for x in off_topic_samples]),
title="Distribution of off-topic samples",
)
irrelevant_scores = np.asarray([x[0] for x in irrelevants])
irrelevant_indices = np.asarray([x[1] for x in irrelevants])
return irrelevant_scores, irrelevant_indices
off_topic_scores = np.asarray([x[0] for x in off_topic_samples])
off_topic_indices = np.asarray([x[1] for x in off_topic_samples])
return off_topic_scores, off_topic_indices
30 changes: 30 additions & 0 deletions selfclean/cleaner/off_topic_samples/quantile_off_topic_mixin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from typing import Tuple

import numpy as np

from ...cleaner.off_topic_samples.base_off_topic_mixin import BaseOffTopicMixin
from ...ssl_library.src.utils.logging import plot_dist


class QuantileOffTopicMixin(BaseOffTopicMixin):
def __init__(self, quantile: float = 0.01, **kwargs):
super().__init__(**kwargs)
self.quantile = quantile

def get_off_topic_ranking(self) -> Tuple[np.ndarray, np.ndarray]:
off_topic_samples = np.quantile(self.distance_matrix, self.quantile, axis=0)
off_topic_samples = [(off_topic_samples[i], i) for i in list(range(self.N))]
off_topic_samples = sorted(
off_topic_samples,
key=lambda tup: tup[0],
reverse=True,
)

if self.plot_distribution and off_topic_samples is not None:
plot_dist(
scores=np.asarray([x[0] for x in off_topic_samples]),
title="Distribution of off-topic samples",
)
off_topic_scores = np.asarray([x[0] for x in off_topic_samples])
off_topic_indices = np.asarray([x[1] for x in off_topic_samples])
return off_topic_scores, off_topic_indices
6 changes: 3 additions & 3 deletions src/cleaner/selfclean.py → selfclean/cleaner/selfclean.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def run_on_image_folder(
hyperparameters: dict = DINO_STANDARD_HYPERPARAMETERS,
issues_to_detect: List[IssueTypes] = [
IssueTypes.NEAR_DUPLICATES,
IssueTypes.IRRELEVANTS,
IssueTypes.OFF_TOPIC_SAMPLES,
IssueTypes.LABEL_ERRORS,
],
# embedding
Expand Down Expand Up @@ -185,7 +185,7 @@ def run_on_dataset(
hyperparameters: dict = DINO_STANDARD_HYPERPARAMETERS,
issues_to_detect: List[IssueTypes] = [
IssueTypes.NEAR_DUPLICATES,
IssueTypes.IRRELEVANTS,
IssueTypes.OFF_TOPIC_SAMPLES,
IssueTypes.LABEL_ERRORS,
],
# embedding
Expand Down Expand Up @@ -229,7 +229,7 @@ def _run(
hyperparameters: dict = DINO_STANDARD_HYPERPARAMETERS,
issues_to_detect: List[IssueTypes] = [
IssueTypes.NEAR_DUPLICATES,
IssueTypes.IRRELEVANTS,
IssueTypes.OFF_TOPIC_SAMPLES,
IssueTypes.LABEL_ERRORS,
],
# embedding
Expand Down
Loading

0 comments on commit 1b4411b

Please sign in to comment.