Skip to content

Commit

Permalink
Merge pull request #6 from Digital-Dermatology/bug_fix/examples_downl…
Browse files Browse the repository at this point in the history
…oad_data

bug fix in examples not able to download data
  • Loading branch information
FabianGroeger96 authored May 1, 2024
2 parents c6906ac + 6e4c4d2 commit 4a1881f
Show file tree
Hide file tree
Showing 5 changed files with 167 additions and 38 deletions.
42 changes: 25 additions & 17 deletions examples/Investigate_Imagenette.ipynb

Large diffs are not rendered by default.

45 changes: 26 additions & 19 deletions examples/Investigate_OxfordIIITPet.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def parse_requirements(filename):
name=PACKAGE_NAME,
packages=proj_packages,
package_dir={PACKAGE_NAME: SOURCE_DIRECTORY},
version="0.0.19",
version="0.0.22",
author="Fabian Groeger",
author_email="[email protected]",
description="A holistic self-supervised data cleaning strategy to detect irrelevant samples, near duplicates and label errors.",
Expand Down
2 changes: 1 addition & 1 deletion src/ssl_library
114 changes: 114 additions & 0 deletions src/utils/data_downloading.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import os
import tarfile
from pathlib import Path
from typing import Union

import pandas as pd
import requests
from torchvision import datasets

# Archive locations fetched by the download helpers in this module.
OXFORD_PETS_URL = "https://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz"
IMAGENETTE_URL = "https://s3.amazonaws.com/fast-ai-imageclas/imagenette2-160.tgz"

# Maps the ImageNette class identifiers (as they appear in sample paths)
# to human-readable class names; `get_imagenette` lower-cases these values
# when it fills `dataset.classes`.
imagenette_labels = {
    "n02979186": "cassette_player",
    "n03417042": "garbage_truck",
    "n01440764": "tench",
    "n02102040": "English_springer",
    "n03028079": "church",
    "n03888257": "parachute",
    "n03394916": "French_horn",
    "n03000684": "chain_saw",
    "n03445777": "golf_ball",
    "n03425413": "gas_pump",
}


def class_name_from_file(img_path: str) -> str:
    """Return the class name encoded in an image file name.

    The class name is everything in the file stem (name without directory
    and extension) that precedes the last underscore, e.g.
    ``american_bulldog_100.jpg`` -> ``american_bulldog``. A stem without
    any underscore yields an empty string.
    """
    stem = Path(img_path).stem
    # rpartition returns ("", "", stem) when there is no underscore, so the
    # head is empty in that case.
    head, _sep, _tail = stem.rpartition("_")
    return head


def get_oxford_pets3t(
    root_path: Union[Path, str] = "oxford_pets3t",
    return_dataframe: bool = False,
    **kwargs,
):
    """Download (if necessary) and load the Oxford-IIIT Pet images.

    The archive is fetched from ``OXFORD_PETS_URL`` and unpacked into
    ``root_path`` unless ``root_path / "images"`` already exists. The images
    are loaded with ``torchvision.datasets.ImageFolder`` and then relabelled:
    the class of every sample is derived from its file name via
    ``class_name_from_file``.

    Args:
        root_path: Directory that holds (or will hold) the ``images`` folder.
        return_dataframe: If true, return the result of
            ``create_dataframe_from_dataset`` instead of the bare dataset.
        **kwargs: Forwarded to ``datasets.ImageFolder``.

    Returns:
        The relabelled ``ImageFolder`` dataset, or whatever
        ``create_dataframe_from_dataset(samples, dataset)`` returns when
        ``return_dataframe`` is true.

    Raises:
        requests.HTTPError: If the download answers with an error status.
    """
    root_path = Path(root_path)
    if not (root_path / "images").exists():
        root_path.mkdir(parents=True, exist_ok=True)
        tar_path = root_path / "images.tar.gz"
        try:
            # The context manager releases the streamed connection; the
            # timeout keeps a stalled server from hanging the call forever.
            with requests.get(
                OXFORD_PETS_URL, stream=True, timeout=60
            ) as response:
                # Fail loudly instead of saving an HTTP error page as archive.
                response.raise_for_status()
                with open(tar_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
            with tarfile.open(tar_path) as tar:
                if hasattr(tarfile, "data_filter"):
                    # Refuse archive members that would escape `root_path`.
                    tar.extractall(root_path, filter="data")
                else:
                    # Older interpreters have no extraction filters.
                    tar.extractall(root_path)
        finally:
            # Never leave a (possibly partial) archive behind.
            if tar_path.exists():
                os.remove(tar_path)
    else:
        print(f"Oxford PetIIIT already downloaded to `{root_path}`.")

    dataset = datasets.ImageFolder(root=str(root_path), **kwargs)
    sample_classes = [class_name_from_file(sample[0]) for sample in dataset.samples]
    # Sort so the class -> index assignment is identical on every run; the
    # iteration order of a set of strings changes between interpreter runs.
    classes = sorted(set(sample_classes))
    class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}

    targets = [class_to_idx[cls_name] for cls_name in sample_classes]
    samples = [
        (sample[0], new_target)
        for sample, new_target in zip(dataset.samples, targets)
    ]

    # Replace the folder-derived labels with the file-name-derived ones.
    dataset.classes = classes
    dataset.class_to_idx = class_to_idx
    dataset.targets = targets
    dataset.samples = samples

    if return_dataframe:
        return create_dataframe_from_dataset(samples, dataset)
    return dataset


def get_imagenette(
    root_path: Union[Path, str] = "imagenette",
    return_dataframe: bool = False,
    **kwargs,
):
    """Download (if necessary) and load the ImageNette (160px) images.

    The archive is fetched from ``IMAGENETTE_URL`` and unpacked into
    ``root_path`` unless ``root_path / "imagenette2-160"`` already exists.
    The extracted folder is loaded with ``torchvision.datasets.ImageFolder``
    and then relabelled: the class of every sample is the name of the
    directory that directly contains the image file.

    Note that ``dataset.classes`` holds the lower-cased human-readable names
    from ``imagenette_labels``, while ``dataset.class_to_idx`` stays keyed by
    the class directory identifiers.

    Args:
        root_path: Directory that holds (or will hold) ``imagenette2-160``.
        return_dataframe: If true, return the result of
            ``create_dataframe_from_dataset`` instead of the bare dataset.
        **kwargs: Forwarded to ``datasets.ImageFolder``.

    Returns:
        The relabelled ``ImageFolder`` dataset, or whatever
        ``create_dataframe_from_dataset(samples, dataset)`` returns when
        ``return_dataframe`` is true.

    Raises:
        requests.HTTPError: If the download answers with an error status.
        KeyError: If a class directory is missing from ``imagenette_labels``.
    """
    root_path = Path(root_path)
    if not (root_path / "imagenette2-160").exists():
        root_path.mkdir(parents=True, exist_ok=True)
        tar_path = root_path / "imagenette2-160.tgz"
        try:
            # The context manager releases the streamed connection; the
            # timeout keeps a stalled server from hanging the call forever.
            with requests.get(
                IMAGENETTE_URL, stream=True, timeout=60
            ) as response:
                # Fail loudly instead of saving an HTTP error page as archive.
                response.raise_for_status()
                with open(tar_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
            with tarfile.open(tar_path) as tar:
                if hasattr(tarfile, "data_filter"):
                    # Refuse archive members that would escape `root_path`.
                    tar.extractall(root_path, filter="data")
                else:
                    # Older interpreters have no extraction filters.
                    tar.extractall(root_path)
        finally:
            # Never leave a (possibly partial) archive behind.
            if tar_path.exists():
                os.remove(tar_path)
    else:
        print(f"ImageNette already downloaded to `{root_path}`.")

    root_path = root_path / "imagenette2-160"
    dataset = datasets.ImageFolder(root=str(root_path), **kwargs)
    # Take the class from the image's parent directory rather than from a
    # fixed position in the path string, so any `root_path` depth and any
    # path separator works.
    sample_classes = [Path(sample[0]).parent.name for sample in dataset.samples]
    # Sort so the class -> index assignment is identical on every run; the
    # iteration order of a set of strings changes between interpreter runs.
    classes = sorted(set(sample_classes))
    class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}

    targets = [class_to_idx[cls_name] for cls_name in sample_classes]
    samples = [
        (sample[0], new_target)
        for sample, new_target in zip(dataset.samples, targets)
    ]

    # Index lookup (not .get) so an unknown class directory is reported by
    # name instead of failing on `None.lower()`.
    dataset.classes = [imagenette_labels[x].lower() for x in classes]
    dataset.class_to_idx = class_to_idx
    dataset.targets = targets
    dataset.samples = samples

    if return_dataframe:
        return create_dataframe_from_dataset(samples, dataset)
    return dataset


def create_dataframe_from_dataset(samples, dataset):
    """Build a per-sample DataFrame next to the dataset it describes.

    Args:
        samples: Sequence of ``(img_path, label)`` pairs.
        dataset: Object whose ``classes`` attribute can be indexed by label.

    Returns:
        A tuple ``(dataset, frame)`` where ``frame`` has the columns
        ``img_path`` (converted to ``str``), ``label`` and ``label_name``.
    """

    def _label_to_name(label):
        # Resolve a label to its entry in `dataset.classes`.
        return dataset.classes[label]

    frame = pd.DataFrame(samples, columns=["img_path", "label"])
    frame["img_path"] = frame["img_path"].astype(str)
    frame["label_name"] = frame["label"].apply(_label_to_name)
    return dataset, frame

0 comments on commit 4a1881f

Please sign in to comment.