Skip to content

Commit

Permalink
Rename "irrelevants" to "off-topic" for Clarity (#8)
Browse files Browse the repository at this point in the history
* started with renaming

* version bump

* added missing target

* installation problem fixes

* optimized req

* fixed package building

* fixed last bit of issues and ran examples

* added test for backwards compatibility check
  • Loading branch information
FabianGroeger96 authored Jan 7, 2025
2 parents b6295f0 + 9a60045 commit 1b4411b
Show file tree
Hide file tree
Showing 50 changed files with 353 additions and 350 deletions.
2 changes: 1 addition & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[submodule "ssl_library"]
path = src/ssl_library
path = selfclean/ssl_library
url = https://github.com/mse-thesis-dermatology/ssl_library
7 changes: 1 addition & 6 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,12 +1,7 @@
FROM pytorch/pytorch:1.9.1-cuda11.1-cudnn8-runtime

RUN apt-get update
RUN apt-get install -y apt-transport-https
RUN apt-get install -y libtcmalloc-minimal4
RUN apt-get install -y libomp-dev
RUN apt-get install -y sox
RUN apt-get install -y git
RUN apt-get install -y gcc g++ python3-dev python-dev
RUN apt-get install -y apt-transport-https libtcmalloc-minimal4 libomp-dev sox git gcc g++ python3-dev python-dev
RUN apt-get clean

RUN pip install --upgrade pip
Expand Down
12 changes: 10 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,19 @@ DOCKER_CMD := docker run $(DOCKER_ARGS) $(GPU_ARGS) $(DOCKER_CONTAINER_NAME) -it
###########################
# PROJECT UTILS
###########################
.PHONY: init
init: ##@Utils initializes the project and pulls all the necessary data
@git submodule update --init --recursive

.PHONY: install
install: ##@Utils install the dependencies for the project
@python3 -m pip install -r requirements.txt
@pre-commit install

.PHONY: update_dependencies
update_dependencies: ##@Utils updates the dependencies for the project in the toml file
@python3.9 -m update_dependencies

.PHONY: clean
clean: ##@Utils clean the project
@black .
Expand Down Expand Up @@ -126,8 +134,8 @@ start_jupyter: _build ##@Docker start a jupyter notebook inside the docker imag
###########################
.PHONY: test
test: _build ##@Test run all tests in the project
$(DOCKER_CMD) /bin/bash -c "python3 -m coverage run -m pytest tests --junitxml=report.xml; coverage report -i --include=src/* --omit="src/ssl_library/*"; coverage xml -i --include=src/* --omit="src/ssl_library/*";"
$(DOCKER_CMD) /bin/bash -c "python3 -m coverage run -m pytest tests --junitxml=report.xml; coverage report -i --include=selfclean/* --omit="selfclean/ssl_library/*"; coverage xml -i --include=selfclean/* --omit="selfclean/ssl_library/*";"

.PHONY: unittest
unittest: _build ##@Test run all unittests in the project
$(DOCKER_CMD) /bin/bash -c "python3 -m coverage run -m pytest tests --junitxml=report.xml --ignore=tests/integration_tests; coverage report -i --include=src/* --omit="src/ssl_library/*"; coverage xml -i --include=src/* --omit="src/ssl_library/*";"
$(DOCKER_CMD) /bin/bash -c "python3 -m coverage run -m pytest tests --junitxml=report.xml --ignore=tests/integration_tests; coverage report -i --include=selfclean/* --omit="selfclean/ssl_library/*"; coverage xml -i --include=selfclean/* --omit="selfclean/ssl_library/*";"
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

![SelfClean Teaser](https://github.com/Digital-Dermatology/SelfClean/raw/main/assets/SelfClean_Teaser.png)

A holistic self-supervised data cleaning strategy to detect irrelevant samples, near duplicates, and label errors.
A holistic self-supervised data cleaning strategy to detect off-topic samples, near duplicates, and label errors.

**Publications:** [SelfClean Paper (NeurIPS24)](https://arxiv.org/abs/2305.17048) | [Data Cleaning Protocol Paper (ML4H23@NeurIPS)](https://arxiv.org/abs/2309.06961)

Expand Down Expand Up @@ -39,7 +39,7 @@ from selfclean import SelfClean
selfclean = SelfClean(
# displays the top-7 images from each error type
# per default this option is disabled
plot_top_N=7,
plot_top_N=7,
)

# run on pytorch dataset
Expand All @@ -53,7 +53,7 @@ issues = selfclean.run_on_image_folder(

# get the data quality issue rankings
df_near_duplicates = issues.get_issues("near_duplicates", return_as_df=True)
df_irrelevants = issues.get_issues("irrelevants", return_as_df=True)
df_off_topic_samples = issues.get_issues("off_topic_samples", return_as_df=True)
df_label_errors = issues.get_issues("label_errors", return_as_df=True)
```

Expand Down
136 changes: 58 additions & 78 deletions examples/Investigate_Imagenette.ipynb

Large diffs are not rendered by default.

92 changes: 32 additions & 60 deletions examples/Investigate_OxfordIIITPet.ipynb

Large diffs are not rendered by default.

46 changes: 36 additions & 10 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,22 +1,48 @@
[build-system]
requires = [ "setuptools>=42", "wheel",]
build-backend = "setuptools.build_meta"

[project]
name = "SelfClean"
description = "A holistic self-supervised data cleaning strategy to detect irrelevant samples, near duplicates and label errors."
authors = [
{ name = "Fabian Gröger", email = "[email protected]" },
]
version = "0.0.35"
name = "selfclean"
description = "A holistic self-supervised data cleaning strategy to detect off-topic samples, near duplicates and label errors."
readme = "README.md"
keywords = [ "machine_learning", "data_cleaning", "datacentric_ai", "datacentric", "self-supervised learning",]
requires-python = ">=3.6"
classifiers = [ "Programming Language :: Python :: 3", "Operating System :: OS Independent",]
dependencies = [ "SciencePlots", "black>=22.6", "codecov", "coverage>=6", "darglint>=1.8", "einops", "isort>=5.10", "jupyter", "loguru", "matplotlib", "memory-profiler", "numpy", "pandas", "pre-commit>=2.20", "pytest", "pytest-cov>=3", "scikit-image", "scikit_learn", "seaborn", "torchinfo", "torchmetrics", "torchvision", "tqdm", "transformers[torch]==4.27.4",]
[[project.authors]]
name = "Fabian Gröger"
email = "[email protected]"

[project.license]
text = "Attribution-NonCommercial 4.0 International"

[project.optional-dependencies]
approximate_nn = [ "annoy",]

[project.urls]
Homepage = "https://selfclean.github.io/"
"Source Code" = "https://github.com/Digital-Dermatology/SelfClean"

[tool.setuptools]
include-package-data = true

[tool.black]
include = '\.pyi?$'
include = "\\.pyi?$"

[tool.isort]
profile = "black"
skip_gitignore=true
py_version=39
skip_gitignore = true
py_version = 39
default_section = "THIRDPARTY"
known_thirdparty=["wandb"]
known_thirdparty = [ "wandb",]

[tool.pytest.ini_options]
# Set true to see logger output in test command line window
log_cli = false
log_cli_level = "INFO"
log_cli_format = "%(time)s :: %(name)s :: %(message)s"

[tool.setuptools.packages.find]
include = [ "selfclean*", "selfclean.*",]
exclude = [ "tests*", "tests.*", "*.tests", "*.tests.*", "*.tests*",]
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ darglint>=1.8
isort>=5.10
pre-commit>=2.20
pytest-cov>=3
transformers
transformers[torch]==4.27.4
seaborn
SciencePlots
scikit-image
Expand Down
9 changes: 9 additions & 0 deletions selfclean/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""
SelfClean.
A holistic self-supervised data cleaning strategy to detect off-topic samples, near duplicates and label errors.
"""

__author__ = "Fabian Groeger"

from .cleaner.selfclean import SelfClean # noqa: F401
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class AutoCleaningMixin:
def __init__(
self,
auto_cleaning: bool = False,
irrelevant_cut_off: float = 0.01,
off_topic_cut_off: float = 0.01,
near_duplicate_cut_off: float = 0.01,
label_error_cut_off: float = 0.01,
significance_level: float = 0.05,
Expand All @@ -28,7 +28,7 @@ def __init__(
):
super().__init__(**kwargs)
self.auto_cleaning = auto_cleaning
self.irrelevant_cut_off = irrelevant_cut_off
self.off_topic_cut_off = off_topic_cut_off
self.near_duplicate_cut_off = near_duplicate_cut_off
self.label_error_cut_off = label_error_cut_off
self.significance_level = significance_level
Expand Down Expand Up @@ -58,19 +58,19 @@ def perform_auto_cleaning(
)
return_dict["near_duplicates"]["auto_issues"] = issues_dup

# Irrelevant Samples
irrelevant_issues = issue_manger["irrelevants"]
if irrelevant_issues is not None:
# Off-Topic Samples
off_topic_issues = issue_manger["off_topic_samples"]
if off_topic_issues is not None:
if output_path is not None:
self.cleaner_kwargs["path"] = (
f"{output_path.stem}_auto_oods{output_path.suffix}"
)
self.cleaner_kwargs["alpha"] = self.irrelevant_cut_off
issues_ood = self.fraction_cut(
scores=irrelevant_issues["scores"],
self.cleaner_kwargs["alpha"] = self.off_topic_cut_off
issues_ot = self.fraction_cut(
scores=off_topic_issues["scores"],
**self.cleaner_kwargs,
)
return_dict["irrelevants"]["auto_issues"] = issues_ood
return_dict["off_topic_samples"]["auto_issues"] = issues_ot

# Label Errors
label_error_issues = issue_manger["label_errors"]
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@


class IssueTypes(Enum):
# NOTE: We leave the accessibility of "off_topic_samples"
# via "irrelevants" to ensure backwards compatibility
IRRELEVANTS = "irrelevants"
OFF_TOPIC_SAMPLES = "off_topic_samples"
NEAR_DUPLICATES = "near_duplicates"
LABEL_ERRORS = "label_errors"

Expand All @@ -25,6 +28,10 @@ def get_issues(
if issue_type is type(IssueTypes):
issue_type = issue_type.value

# NOTE: Backwards compatibility with "irrelevants"
if issue_type == "irrelevants":
issue_type = IssueTypes.OFF_TOPIC_SAMPLES.value

sel_issues = self.issue_dict.get(issue_type, None)
if sel_issues is None:
return sel_issues
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np


class BaseIrrelevantMixin(ABC):
class BaseOffTopicMixin(ABC):
@abstractmethod
def get_irrelevant_ranking(self) -> Tuple[np.ndarray, np.ndarray]:
def get_off_topic_ranking(self) -> Tuple[np.ndarray, np.ndarray]:
raise NotImplementedError()
Original file line number Diff line number Diff line change
Expand Up @@ -3,32 +3,32 @@
import numpy as np
from scipy.cluster.hierarchy import single

from ...cleaner.irrelevants.base_irrelevant_mixin import BaseIrrelevantMixin
from ...cleaner.off_topic_samples.base_off_topic_mixin import BaseOffTopicMixin
from ...scoring.lad_scoring import LAD
from ...ssl_library.src.utils.logging import plot_dist


class LADIrrelevantMixin(BaseIrrelevantMixin):
class LADOffTopicMixin(BaseOffTopicMixin):
def __init__(self, global_leaves: bool = False, **kwargs):
super().__init__(**kwargs)
self.global_leaves = global_leaves

def get_irrelevant_ranking(self) -> Tuple[np.ndarray, np.ndarray]:
def get_off_topic_ranking(self) -> Tuple[np.ndarray, np.ndarray]:
# linkage_matrix: [idx1, idx2, dist, sample_count]
linkage_matrix = single(self.p_distances)
lad = LAD()
irrelevants = lad.calc_scores(
off_topic_samples = lad.calc_scores(
linkage_matrix=linkage_matrix,
global_leaves=self.global_leaves,
)
# free up allocated memory
del lad, linkage_matrix

if self.plot_distribution and irrelevants is not None:
if self.plot_distribution and off_topic_samples is not None:
plot_dist(
scores=np.asarray([x[0] for x in irrelevants]),
title="Distribution of irrelevant samples",
scores=np.asarray([x[0] for x in off_topic_samples]),
title="Distribution of off-topic samples",
)
irrelevant_scores = np.asarray([x[0] for x in irrelevants])
irrelevant_indices = np.asarray([x[1] for x in irrelevants])
return irrelevant_scores, irrelevant_indices
off_topic_scores = np.asarray([x[0] for x in off_topic_samples])
off_topic_indices = np.asarray([x[1] for x in off_topic_samples])
return off_topic_scores, off_topic_indices
30 changes: 30 additions & 0 deletions selfclean/cleaner/off_topic_samples/quantile_off_topic_mixin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from typing import Tuple

import numpy as np

from ...cleaner.off_topic_samples.base_off_topic_mixin import BaseOffTopicMixin
from ...ssl_library.src.utils.logging import plot_dist


class QuantileOffTopicMixin(BaseOffTopicMixin):
def __init__(self, quantile: float = 0.01, **kwargs):
super().__init__(**kwargs)
self.quantile = quantile

def get_off_topic_ranking(self) -> Tuple[np.ndarray, np.ndarray]:
off_topic_samples = np.quantile(self.distance_matrix, self.quantile, axis=0)
off_topic_samples = [(off_topic_samples[i], i) for i in list(range(self.N))]
off_topic_samples = sorted(
off_topic_samples,
key=lambda tup: tup[0],
reverse=True,
)

if self.plot_distribution and off_topic_samples is not None:
plot_dist(
scores=np.asarray([x[0] for x in off_topic_samples]),
title="Distribution of off-topic samples",
)
off_topic_scores = np.asarray([x[0] for x in off_topic_samples])
off_topic_indices = np.asarray([x[1] for x in off_topic_samples])
return off_topic_scores, off_topic_indices
6 changes: 3 additions & 3 deletions src/cleaner/selfclean.py → selfclean/cleaner/selfclean.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def run_on_image_folder(
hyperparameters: dict = DINO_STANDARD_HYPERPARAMETERS,
issues_to_detect: List[IssueTypes] = [
IssueTypes.NEAR_DUPLICATES,
IssueTypes.IRRELEVANTS,
IssueTypes.OFF_TOPIC_SAMPLES,
IssueTypes.LABEL_ERRORS,
],
# embedding
Expand Down Expand Up @@ -185,7 +185,7 @@ def run_on_dataset(
hyperparameters: dict = DINO_STANDARD_HYPERPARAMETERS,
issues_to_detect: List[IssueTypes] = [
IssueTypes.NEAR_DUPLICATES,
IssueTypes.IRRELEVANTS,
IssueTypes.OFF_TOPIC_SAMPLES,
IssueTypes.LABEL_ERRORS,
],
# embedding
Expand Down Expand Up @@ -229,7 +229,7 @@ def _run(
hyperparameters: dict = DINO_STANDARD_HYPERPARAMETERS,
issues_to_detect: List[IssueTypes] = [
IssueTypes.NEAR_DUPLICATES,
IssueTypes.IRRELEVANTS,
IssueTypes.OFF_TOPIC_SAMPLES,
IssueTypes.LABEL_ERRORS,
],
# embedding
Expand Down
Loading

0 comments on commit 1b4411b

Please sign in to comment.